Skip to content

Commit d5fbf98

Browse files
authored
Fix #62 allow complex numpy/pandas columns like strings and datetimes (#69)
* add dataframe tests * working on string types * finalize issue #62 * adjust failing test
1 parent 6547642 commit d5fbf98

File tree

5 files changed

+241
-25
lines changed

5 files changed

+241
-25
lines changed

monetdbe/_cffi.py

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import logging
66
from pathlib import Path
77
from re import compile, DOTALL
8-
from typing import Optional, Any, Dict, Tuple, Callable, Type
8+
from typing import Optional, Any, Dict, Tuple, Callable
99

1010
import numpy as np
1111

@@ -83,21 +83,21 @@ def check_error(msg: ffi.CData) -> None:
8383

8484

8585
# format: monetdb type: (cast name, converter function, numpy type, monetdb null value)
86-
type_map: Dict[Any, Tuple[str, Optional[Callable], Optional[Type], Optional[Any]]] = {
86+
type_map: Dict[Any, Tuple[str, Optional[Callable], np.dtype, Optional[Any]]] = {
8787
lib.monetdbe_bool: ("bool", bool, np.dtype(np.bool), None),
8888
lib.monetdbe_int8_t: ("int8_t", None, np.dtype(np.int8), np.iinfo(np.int8).min),
8989
lib.monetdbe_int16_t: ("int16_t", None, np.dtype(np.int16), np.iinfo(np.int16).min),
9090
lib.monetdbe_int32_t: ("int32_t", None, np.dtype(np.int32), np.iinfo(np.int32).min),
9191
lib.monetdbe_int64_t: ("int64_t", None, np.dtype(np.int64), np.iinfo(np.int64).min),
92-
lib.monetdbe_int128_t: ("int128_t", None, None, None),
93-
lib.monetdbe_size_t: ("size_t", None, None, None),
92+
lib.monetdbe_int128_t: ("int128_t", None, np.dtype(np.int64), None), # todo: add 128bit support
93+
lib.monetdbe_size_t: ("size_t", None, np.dtype(np.uint), None),
9494
lib.monetdbe_float: ("float", py_float, np.dtype(np.float), np.finfo(np.float).min),
9595
lib.monetdbe_double: ("double", py_float, np.dtype(np.float), np.finfo(np.float).min),
96-
lib.monetdbe_str: ("str", make_string, np.dtype(np.str), None),
97-
lib.monetdbe_blob: ("blob", make_blob, None, None),
98-
lib.monetdbe_date: ("date", py_date, np.dtype(np.datetime64), None),
99-
lib.monetdbe_time: ("time", py_time, np.dtype(np.datetime64), None),
100-
lib.monetdbe_timestamp: ("timestamp", py_timestamp, np.dtype(np.datetime64), None),
96+
lib.monetdbe_str: ("str", make_string, np.dtype('=O'), None),
97+
lib.monetdbe_blob: ("blob", make_blob, np.dtype('=O'), None),
98+
lib.monetdbe_date: ("date", py_date, np.dtype('=O'), None), # np.dtype('datetime64[D]')
99+
lib.monetdbe_time: ("time", py_time, np.dtype('=O'), None), # np.dtype('datetime64[ns]')
100+
lib.monetdbe_timestamp: ("timestamp", py_timestamp, np.dtype('=O'), None), # np.dtype('datetime64[ns]')
101101
}
102102

103103

@@ -169,21 +169,29 @@ def cleanup_result(self, result: ffi.CData):
169169
if result and self._connection:
170170
check_error(lib.monetdbe_cleanup_result(self._connection, result))
171171

172-
def open(self, dbdir: Optional[Path] = None):
172+
def open(
173+
self,
174+
dbdir: Optional[Path] = None,
175+
memorylimit: int = 0,
176+
querytimeout: int = 0,
177+
sessiontimeout: int = 0,
178+
nr_threads: int = 0,
179+
have_hge: bool = False
180+
):
181+
173182
if not dbdir:
174183
url = ffi.NULL
175184
else:
176-
url = str(dbdir).encode() # ffi.new("char[]", str(dbdir).encode())
185+
url = str(dbdir).encode()
177186

178187
p_connection = ffi.new("monetdbe_database *")
179-
# p_options = ffi.new("monetdbe_options *")
180-
# p_options.memorylimit = 0
181-
# p_options.querytimeout = 0
182-
# p_options.sessiontimeout = 0
183-
# p_options.nr_threads = 0
184-
# p_options.have_hge = False
185188

186-
p_options = ffi.NULL
189+
p_options = ffi.new("monetdbe_options *")
190+
p_options.memorylimit = memorylimit
191+
p_options.querytimeout = querytimeout
192+
p_options.sessiontimeout = sessiontimeout
193+
p_options.nr_threads = nr_threads
194+
p_options.have_hge = have_hge
187195

188196
result_code = lib.monetdbe_open(p_connection, url, p_options)
189197
connection = p_connection[0]
@@ -204,7 +212,7 @@ def open(self, dbdir: Optional[Path] = None):
204212

205213
return connection
206214

207-
def close(self):
215+
def close(self) -> None:
208216
if self._connection:
209217
if lib.monetdbe_close(self._connection):
210218
raise exceptions.OperationalError("Failed to close database")
@@ -258,10 +266,13 @@ def result_fetch_numpy(self, monetdbe_result: ffi.CData):
258266
rcol = p_rcol[0]
259267
name = make_string(rcol.name)
260268
cast_string, cast_function, numpy_type, monetdbe_null = type_map[rcol.type]
261-
# todo (gijs): typing
262-
buffer_size = monetdbe_result.nrows * numpy_type.itemsize # type: ignore
263-
c_buffer = ffi.buffer(rcol.data, buffer_size)
264-
np_col = np.frombuffer(c_buffer, dtype=numpy_type)
269+
270+
if numpy_type.char == 'O':
271+
np_col: np.ndarray = np.array([extract(rcol, r) for r in range(monetdbe_result.nrows)])
272+
else:
273+
buffer_size = monetdbe_result.nrows * numpy_type.itemsize
274+
c_buffer = ffi.buffer(rcol.data, buffer_size)
275+
np_col = np.frombuffer(c_buffer, dtype=numpy_type)
265276

266277
if monetdbe_null:
267278
mask = np_col == monetdbe_null

monetdbe/monetize.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,20 @@ def monet_memoryview(data: memoryview) -> str:
5050
return "'%s'" % data.tobytes().hex()
5151

5252

53+
def monet_float(data: float) -> str:
    """Render a Python float as the text used for it in a SQL statement.

    NaN has no numeric literal form here, so it is emitted as ``NULL``;
    every other value is rendered with ``str``.

    :param data: the float to render
    :return: ``'NULL'`` when *data* is NaN, otherwise ``str(data)``
    """
    import math  # local import: the file's import block is not visible in this view

    if math.isnan(data):
        return 'NULL'
    return str(data)
58+
59+
5360
mapping: List[Tuple[Type, Callable]] = [
5461
(str, monet_escape),
5562
(bytes, monet_bytes),
5663
(memoryview, monet_memoryview),
5764
(int, str),
5865
(complex, str),
59-
(float, str),
66+
(float, monet_float),
6067
(decimal.Decimal, str),
6168
(datetime.datetime, monet_escape),
6269
(datetime.time, monet_escape),

notebooks/basic_example.ipynb

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"from monetdbe import connect, Timestamp\n",
10+
"from datetime import datetime"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": 2,
16+
"metadata": {},
17+
"outputs": [],
18+
"source": [
19+
"con = connect(autocommit=True) # open an in-memory database"
20+
]
21+
},
22+
{
23+
"cell_type": "code",
24+
"execution_count": 3,
25+
"metadata": {},
26+
"outputs": [],
27+
"source": [
28+
"cur = con.execute(\"create table example(d timestamp, i int, f float)\")"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 4,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"_ = cur.executemany(\"insert into example(d, i, f) values (?, ?, ?)\", (\n",
38+
" (datetime.now(), 10, 0.1),\n",
39+
" (Timestamp(2004, 2, 14, 7, 15, 0, 510241), 20, 0.2),\n",
40+
"))"
41+
]
42+
},
43+
{
44+
"cell_type": "code",
45+
"execution_count": 5,
46+
"metadata": {},
47+
"outputs": [],
48+
"source": [
49+
"_ = cur.execute(\"select * from example\")"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 6,
55+
"metadata": {},
56+
"outputs": [
57+
{
58+
"data": {
59+
"text/plain": [
60+
"[(datetime.datetime(2020, 6, 29, 15, 40, 35, 605000), 10, 0.1),\n",
61+
" (datetime.datetime(2004, 2, 14, 7, 15, 0, 510000), 20, 0.2)]"
62+
]
63+
},
64+
"execution_count": 6,
65+
"metadata": {},
66+
"output_type": "execute_result"
67+
}
68+
],
69+
"source": [
70+
"cur.fetchall()"
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 13,
76+
"metadata": {},
77+
"outputs": [],
78+
"source": [
79+
"_ = cur.execute(\"select * from example\")"
80+
]
81+
},
82+
{
83+
"cell_type": "code",
84+
"execution_count": 14,
85+
"metadata": {},
86+
"outputs": [],
87+
"source": [
88+
"array = cur.fetchdf()"
89+
]
90+
},
91+
{
92+
"cell_type": "code",
93+
"execution_count": 17,
94+
"metadata": {},
95+
"outputs": [
96+
{
97+
"data": {
98+
"text/plain": [
99+
"(Timestamp('1970-01-01 02:24:39.017219613'),\n",
100+
" Timestamp('1817-02-23 09:10:54.898455133'))"
101+
]
102+
},
103+
"execution_count": 17,
104+
"metadata": {},
105+
"output_type": "execute_result"
106+
}
107+
],
108+
"source": [
109+
"tuple(array['d'])"
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"execution_count": null,
115+
"metadata": {},
116+
"outputs": [],
117+
"source": []
118+
}
119+
],
120+
"metadata": {
121+
"kernelspec": {
122+
"display_name": "Python 3",
123+
"language": "python",
124+
"name": "python3"
125+
},
126+
"language_info": {
127+
"codemirror_mode": {
128+
"name": "ipython",
129+
"version": 3
130+
},
131+
"file_extension": ".py",
132+
"mimetype": "text/x-python",
133+
"name": "python",
134+
"nbconvert_exporter": "python",
135+
"pygments_lexer": "ipython3",
136+
"version": "3.8.2"
137+
}
138+
},
139+
"nbformat": 4,
140+
"nbformat_minor": 4
141+
}

tests/test_dataframe.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from datetime import datetime
2+
from typing import List, Any
3+
from unittest import TestCase
4+
from math import isnan
5+
from pandas import DataFrame
6+
7+
from monetdbe import connect, Timestamp
8+
9+
10+
def _connect(values: List[Any], type: str) -> DataFrame:
    """Store *values* in a fresh single-column table and read them back.

    A new connection is opened with autocommit enabled, a table
    ``example`` with one column ``d`` of SQL type *type* is created, each
    value is inserted as its own row, and the result of selecting the
    whole table is returned via ``fetchdf``.

    :param values: the values to insert, one row per value
    :param type: SQL type name used for column ``d``
    :return: whatever ``fetchdf`` yields for ``select * from example``
    """
    connection = connect(autocommit=True)
    cursor = connection.execute(f"create table example(d {type})")
    # executemany expects one parameter tuple per row
    rows = ((value,) for value in values)
    cursor.executemany("insert into example(d) values (?)", rows)
    cursor.execute("select * from example")
    return cursor.fetchdf()
16+
17+
18+
class TestDataFrame(TestCase):
    """Round-trip checks: insert Python values, read them back with ``fetchdf``.

    Each test builds a one-column table of a given SQL type through
    ``_connect`` and compares column ``d`` of the returned frame with the
    values that were inserted.
    """

    def test_timestamp(self):
        # NOTE(review): the sub-second part of "now" is dropped so the value
        # round-trips exactly; stored timestamps presumably keep less than
        # microsecond precision -- confirm against the engine.
        now = datetime.now().replace(microsecond=0)
        values = [
            now,
            Timestamp(2004, 2, 14, 7, 15, 0, 510000),
        ]
        df = _connect(values, 'timestamp')
        self.assertEqual(values, list(df['d']))

    def test_int(self):
        values = [5, 10, -100]
        df = _connect(values, 'int')
        self.assertEqual(values, list(df['d']))

    def test_float(self):
        values = [5.0, 10.0, -100.0, float('nan')]
        df = _connect(values, 'float')
        # NaN never compares equal to itself, so the last element is
        # checked separately with isnan.
        self.assertEqual(values[:-1], list(df['d'])[:-1])
        self.assertTrue(isnan(df['d'].iloc[-1]))

    def test_char(self):
        values = ['a', 'i', 'é']
        df = _connect(values, 'char')
        self.assertEqual(values, list(df['d']))

    def test_string(self):
        values = ['asssssssssssssssss', 'iwwwwwwwwwwwwwww', 'éooooooooooooooooooooo']
        df = _connect(values, 'string')
        self.assertEqual(values, list(df['d']))

    def test_varchar(self):
        values = ['a', 'aa', 'éooooooooooooooooooooo']
        # Previously this created a 'string' column and therefore repeated
        # test_string; use an actual varchar wide enough for the longest value.
        df = _connect(values, 'varchar(32)')
        self.assertEqual(values, list(df['d']))

    def test_uuid(self):
        values = ['6c49869d-45dc-4b00-ae55-5bd363c0c72c', '2ad49a96-ba10-11ea-b3de-0242ac130004']
        df = _connect(values, 'uuid')
        self.assertEqual(values, list(df['d']))

tests/test_lite/test_dbapi05.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,4 @@ def test_description(self, monetdbe_cursor):
99
def test_description_fields(self, monetdbe_cursor):
1010
monetdbe_cursor.execute('select name from sys.tables')
1111
assert monetdbe_cursor.description[0][0] == "name"
12-
assert monetdbe_cursor.description[0][1] == numpy.dtype('<U')
12+
assert monetdbe_cursor.description[0][1] == numpy.dtype('O')

0 commit comments

Comments
 (0)