Vadim Markovtsev, Athenian.
Vadim Markovtsev
Athenian
pickle.dumps()
, to_arrow()
are very slow
object
dtype. memcpy
of the internals. nogil
).PyObject *
.assert PyArray_IS_C_CONTIGUOUS(arr)
assert PyArray_NDIM(arr) == 1
assert PyArray_DESCR(arr).kind == b"O"
cdef PyObject **data = <PyObject **> PyArray_DATA(arr)
for i in range(PyArray_DIM(arr, 0)):
serialize(data[i])
for i in range(PyList_GET_SIZE(obj)):
serialize(PyList_GET_ITEM(obj, i))
while PyDict_Next(obj, &pos, &key, &val):
serialize(key)
serialize(val)
double PyFloat_AS_DOUBLE(PyObject *)
long PyLong_AsLong(PyObject *)
void PyArray_ScalarAsCtype(PyObject *scalar, void *ctype)
memcpy(buffer, &value, sizeof(value))
Internal representation: smart UCS1, UCS2, or UCS4.
memcpy(
buffer,
PyUnicode_DATA(obj),
PyUnicode_GET_LENGTH(obj) * PyUnicode_KIND(obj)
)
datetime
and timedelta
PyDateTime_CAPI
: internal representation is a struct, not a timestamp
int PyDateTime_GET_YEAR(PyObject *)
int PyDateTime_GET_MONTH(PyObject *)
int PyDateTime_DATE_GET_SECOND(PyObject *)
int PyDateTime_DELTA_GET_DAYS(PyObject *)
int PyDateTime_DELTA_GET_SECONDS(PyObject *)
@dataclass(slots=True)
class Movie:
    """Example record used throughout the deck to illustrate (de)serialization.

    slots=True drops the per-instance __dict__, shrinking memory and speeding
    attribute access for the many instances a cached page would hold.
    """

    name: str
    rating: float
    # Actor is a project-declared model not visible in this excerpt.
    actors: list[Actor]
Typical paginated request without DB pushdown:
blob = await load_from_cache(key)
movies = deserialize(blob)
selected = movies[offset:offset + limit]
return to_json(selected)
Slow as hell with our "movies":
blob = await load_from_cache(key)
movies = deserialize(blob)
selected = movies[offset:offset + limit]
return to_json(selected)
First call:
movies = await bake_movies(...)
dicts = to_atoms(movies)
await store_to_cache(dicts, key)
return to_json(dicts[:limit]), key
n + 1 call:
blob = await load_from_cache(key)
dicts = deserialize(blob)
return to_json(dicts[offset:offset + limit]), key
First call:
movies = await bake_movies(...)
dicts = to_atoms(movies)
await store_to_cache(serialize(dicts), key)
return to_json(dicts[:limit]), key
n + 1 call:
blob = await load_from_cache(key)
dicts = deserialize(blob)
return to_json(dicts[offset:offset + limit]), key
to_json_vadim(movies)
'[{"name": "RRR", ...},{"name": "Up", ...},{...'
[ 1, 100, 200]
First call:
movies = await bake_movies(...)
blob, toc = to_json_vadim(movies)
await store_to_cache(serialize((blob, toc)), key)
selected = blob[:toc[limit] - 1]
return f'{{"key": "{key}", "movies": [{selected}]}}'
n + 1 call:
blob, toc = deserialize(await load_from_cache(key))
selected = blob[toc[offset]:toc[offset + limit] - 1]
return f'{{"key": "{key}", "movies": [{selected}]}}'
to_json_vadim(movies)
movie
according to the spec. list
, dict
int
, float
to str. datetime
, timedelta
to str. str
to str: 'esc"ape\n' -> r'"esc\"ape\n"'; utf8
# One node of the compiled serialization spec tree: describes how to
# serialize a single field or value.
ctypedef struct SpecNode:
    DataType type            # which DataType variant this node encodes
    Py_ssize_t offset        # presumably the field's offset inside the model -- confirm against builder
    PyTypeObject *model      # NOTE(review): looks like the Python type used when type == DT_MODEL -- confirm
    vector[SpecNode] nested  # child specs for containers (DT_LIST/DT_DICT) and nested models
cdef enum DataType:
    # Tag identifying each value kind the serializer understands.
    # The original slide laid these out in two columns, which the text
    # extraction fused into invalid lines; flattened to one enumerator
    # per line, preserving every visible name/value pair.
    DT_INVALID = 0
    DT_MODEL = 1
    DT_LIST = 2
    DT_DICT = 3
    DT_LONG = 4
    DT_FLOAT = 5
    DT_STRING = 6
    DT_DT = 7    # datetime
    DT_TD = 8    # timedelta
    DT_BOOL = 9
# @constrained_json_spec is a project decorator (definition not visible in
# this excerpt); presumably it compiles the class annotations into a
# SpecNode tree and attaches it as __json_spec__ -- TODO confirm.
@constrained_json_spec
@dataclass(slots=True)
class Movie:
    """Same example Movie model, now carrying a precompiled JSON spec."""

    name: str
    rating: float
    actors: list[Actor]
    # __json_spec__: ClassVar[PyCapsule]