From 632c384020733a493203741159d2c8d8b2a9b19b Mon Sep 17 00:00:00 2001 From: "Ilya (Marshal)" Date: Sun, 31 May 2026 03:06:07 +0200 Subject: [PATCH] Restructure single-file lib.rs into per-domain modules --- src/car.rs | 106 +++++ src/cid.rs | 45 ++ src/cid/de.rs | 29 ++ src/cid/ser.rs | 9 + src/convert.rs | 20 + src/dag_cbor.rs | 8 + src/dag_cbor/de.rs | 241 +++++++++++ src/dag_cbor/ser.rs | 218 ++++++++++ src/error.rs | 6 + src/ffi.rs | 16 + src/ffi/dict.rs | 48 +++ src/ffi/int.rs | 31 ++ src/ffi/key_cache.rs | 122 ++++++ src/ffi/recursion.rs | 10 + src/ffi/string.rs | 35 ++ src/ffi/sys.rs | 13 + src/io.rs | 9 + src/io/leb128.rs | 40 ++ src/io/reader.rs | 42 ++ src/io/writer.rs | 29 ++ src/lib.rs | 997 +------------------------------------------ src/multibase.rs | 7 + src/multibase/de.rs | 17 + src/multibase/ser.rs | 18 + 24 files changed, 1136 insertions(+), 980 deletions(-) create mode 100644 src/car.rs create mode 100644 src/cid.rs create mode 100644 src/cid/de.rs create mode 100644 src/cid/ser.rs create mode 100644 src/convert.rs create mode 100644 src/dag_cbor.rs create mode 100644 src/dag_cbor/de.rs create mode 100644 src/dag_cbor/ser.rs create mode 100644 src/error.rs create mode 100644 src/ffi.rs create mode 100644 src/ffi/dict.rs create mode 100644 src/ffi/int.rs create mode 100644 src/ffi/key_cache.rs create mode 100644 src/ffi/recursion.rs create mode 100644 src/ffi/string.rs create mode 100644 src/ffi/sys.rs create mode 100644 src/io.rs create mode 100644 src/io/leb128.rs create mode 100644 src/io/reader.rs create mode 100644 src/io/writer.rs create mode 100644 src/multibase.rs create mode 100644 src/multibase/de.rs create mode 100644 src/multibase/ser.rs diff --git a/src/car.rs b/src/car.rs new file mode 100644 index 0000000..410d1ee --- /dev/null +++ b/src/car.rs @@ -0,0 +1,106 @@ +//! CAR (Content Addressable aRchive) v1 container decoding. Encoding is not +//! implemented yet; when it lands this becomes `car/{de,ser}.rs`. + +use cbor4ii::core::dec::Read; +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::dag_cbor::de::to_pyobject; +use crate::error::value_error; +use crate::ffi::recursion::current_recursion_limit; +use crate::io::leb128::read_u64; +use crate::io::SliceReader; + +#[pyfunction] +pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(Py, Bound<'py, PyDict>)> { + let buf = &mut SliceReader::new(data); + let max_depth = current_recursion_limit(); + + if read_u64(buf).is_err() { + return Err(value_error( + "Failed to read CAR header", + "Invalid uvarint".to_string(), + )); + } + let Ok(header_obj) = to_pyobject(py, buf, 0, max_depth) else { + return Err(value_error( + "Failed to read CAR header", + "Invalid DAG-CBOR".to_string(), + )); + }; + + let header = header_obj.cast_bound::(py)?; + + let Some(version) = header.get_item("version")? else { + return Err(value_error( + "Failed to read CAR header", + "Version is None".to_string(), + )); + }; + if version.cast::()?.extract::()? != 1 { + return Err(value_error( + "Failed to read CAR header", + "Unsupported version. Version must be 1".to_string(), + )); + } + + let Some(roots) = header.get_item("roots")? else { + return Err(value_error( + "Failed to read CAR header", + "Roots is None".to_string(), + )); + }; + if roots.cast::()?.len() == 0 { + return Err(value_error( + "Failed to read CAR header", + "Roots is empty. Must be at least one".to_string(), + )); + } + + // FIXME (MarshalX): we are not verifying if the roots are valid CIDs + + let parsed_blocks = PyDict::new(py); + + loop { + if read_u64(buf).is_err() { + // FIXME (MarshalX): we are not raising an error here because of possible EOF + break; + } + + let cid_bytes_before = buf.buf; + // `&[u8]` is itself an `io::Read`, so we hand it to `Cid::read_bytes` + // directly and recover the consumed length from the slice shrink. + let mut slice: &[u8] = cid_bytes_before; + let cid_result = ::cid::Cid::read_bytes(&mut slice); + let Ok(cid) = cid_result else { + return Err(value_error( + "Failed to read CID of block", + cid_result.unwrap_err().to_string(), + )); + }; + + if cid.codec() != 0x71 { + return Err(value_error( + "Failed to read CAR block", + "Unsupported codec. For now we support only DAG-CBOR (0x71)".to_string(), + )); + } + + let consumed = cid_bytes_before.len() - slice.len(); + buf.advance(consumed); + let cid_raw = &cid_bytes_before[..consumed]; + + let block_result = to_pyobject(py, buf, 0, max_depth); + let Ok(block) = block_result else { + return Err(value_error( + "Failed to read CAR block", + block_result.unwrap_err().to_string(), + )); + }; + + let key = PyBytes::new(py, cid_raw).into_pyobject(py)?; + parsed_blocks.set_item(key, block)?; + } + + Ok((header_obj, parsed_blocks)) +} diff --git a/src/cid.rs b/src/cid.rs new file mode 100644 index 0000000..834d6db --- /dev/null +++ b/src/cid.rs @@ -0,0 +1,45 @@ +//! CID (Content IDentifier) codec plus the shared CID helpers used across +//! codecs: extraction from arbitrary Python objects and the O(1) shape check. + +pub(crate) mod de; +pub(crate) mod ser; + +pub(crate) use de::decode_cid; +pub(crate) use ser::encode_cid; + +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::convert::extract_bytes; +use crate::error::value_error; + +// `Cid::try_from` parses two varints + a multihash on every call; this O(1) +// shape check rejects payloads that can't be a CID without paying for it. +// CIDv1 starts with `0x01`; CIDv0 is exactly 34 bytes starting `0x12 0x20`. +#[inline] +pub(crate) fn looks_like_cid(bytes: &[u8]) -> bool { + if bytes.len() < 4 { + return false; + } + if bytes[0] == 0x01 { + return true; + } + bytes.len() == 34 && bytes[0] == 0x12 && bytes[1] == 0x20 +} + +pub(crate) fn extract_cid(data: &Bound) -> PyResult<::cid::Cid> { + let cid = if let Ok(s) = data.cast::() { + ::cid::Cid::try_from(s.to_str()?) + } else { + ::cid::Cid::try_from(extract_bytes(data)?) + }; + + if let Ok(cid) = cid { + Ok(cid) + } else { + Err(value_error( + "Failed to decode CID", + cid.unwrap_err().to_string(), + )) + } +} diff --git a/src/cid/de.rs b/src/cid/de.rs new file mode 100644 index 0000000..f0e06ee --- /dev/null +++ b/src/cid/de.rs @@ -0,0 +1,29 @@ +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::cid::extract_cid; + +fn hash_to_pydict<'py>(py: Python<'py>, cid: &::cid::Cid) -> PyResult> { + let hash = cid.hash(); + let dict_obj = PyDict::new(py); + + dict_obj.set_item("code", hash.code())?; + dict_obj.set_item("size", hash.size())?; + dict_obj.set_item("digest", PyBytes::new(py, hash.digest()))?; + + Ok(dict_obj) +} + +fn to_pydict<'py>(py: Python<'py>, cid: &::cid::Cid) -> PyResult> { + let dict_obj = PyDict::new(py); + + dict_obj.set_item("version", cid.version() as u64)?; + dict_obj.set_item("codec", cid.codec())?; + dict_obj.set_item("hash", hash_to_pydict(py, cid)?)?; + Ok(dict_obj) +} + +#[pyfunction] +pub fn decode_cid<'py>(py: Python<'py>, data: &Bound) -> PyResult> { + to_pydict(py, &extract_cid(data)?) +} diff --git a/src/cid/ser.rs b/src/cid/ser.rs new file mode 100644 index 0000000..889de52 --- /dev/null +++ b/src/cid/ser.rs @@ -0,0 +1,9 @@ +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::cid::extract_cid; + +#[pyfunction] +pub fn encode_cid<'py>(py: Python<'py>, data: &Bound) -> PyResult> { + Ok(PyString::new(py, extract_cid(data)?.to_string().as_str())) +} diff --git a/src/convert.rs b/src/convert.rs new file mode 100644 index 0000000..b1525f5 --- /dev/null +++ b/src/convert.rs @@ -0,0 +1,20 @@ +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::error::value_error; + +/// Borrow a byte view from a `bytes`, `bytearray`, or `str` (UTF-8) object. +pub(crate) fn extract_bytes<'py>(obj: &'py Bound<'py, PyAny>) -> PyResult<&'py [u8]> { + if let Ok(b) = obj.cast::() { + Ok(b.as_bytes()) + } else if let Ok(ba) = obj.cast::() { + Ok(unsafe { ba.as_bytes() }) + } else if let Ok(s) = obj.cast::() { + Ok(s.to_str()?.as_bytes()) + } else { + Err(value_error( + "Failed to encode multibase", + "Unsupported data type".to_string(), + )) + } +} diff --git a/src/dag_cbor.rs b/src/dag_cbor.rs new file mode 100644 index 0000000..2a72a5e --- /dev/null +++ b/src/dag_cbor.rs @@ -0,0 +1,8 @@ +//! DAG-CBOR codec: decode (`de`) and encode (`ser`) of the IPLD data model +//! to and from native Python objects. + +pub(crate) mod de; +pub(crate) mod ser; + +pub(crate) use de::{decode_dag_cbor, decode_dag_cbor_multi}; +pub(crate) use ser::encode_dag_cbor; diff --git a/src/dag_cbor/de.rs b/src/dag_cbor/de.rs new file mode 100644 index 0000000..5522ac7 --- /dev/null +++ b/src/dag_cbor/de.rs @@ -0,0 +1,241 @@ +use anyhow::{anyhow, Result}; +use cbor4ii::core::{ + dec::{self, Decode, Read}, + major, marker, types, +}; +use pyo3::{ffi, prelude::*, types::*, BoundObject}; + +use crate::error::value_error; +use crate::ffi::dict::new_presized; +use crate::ffi::key_cache::intern; +use crate::ffi::recursion::current_recursion_limit; +use crate::ffi::string::from_bytes; +use crate::io::{peek_one, SliceReader}; + +#[cfg(CPython)] +use crate::ffi::dict::set_item_known_hash; + +fn map_key_cmp(a: &[u8], b: &[u8]) -> std::cmp::Ordering { + /* The keys in every map must be sorted length-first by the byte representation of the string keys, where: + - If two keys have different lengths, the shorter one sorts earlier; + - If two keys have the same length, the one with the lower value in (byte-wise) lexical order sorts earlier. + */ + if a.len() != b.len() { + a.len().cmp(&b.len()) + } else { + a.cmp(b) + } +} + +pub(crate) fn to_pyobject<'de, R: dec::Read<'de>>( + py: Python, + r: &mut R, + depth: usize, + max_depth: usize, +) -> Result> +where + R::Error: Send + Sync, +{ + if depth > max_depth { + PyErr::new::( + "RecursionError: maximum recursion depth exceeded in DAG-CBOR decoding", + ) + .restore(py); + + return Err(anyhow!("Maximum recursion depth exceeded")); + } + + let byte = peek_one(r)?; + Ok(match dec::if_major(byte) { + major::UNSIGNED => u64::decode(r)?.into_pyobject(py)?.into(), + major::NEGATIVE => i128::decode(r)?.into_pyobject(py)?.into(), + major::BYTES => PyBytes::new(py, >::decode(r)?.0) + .into_pyobject(py)? + .into(), + major::STRING => { + // ASCII fast path inside the helper; non-ASCII falls through to + // `PyUnicode_DecodeUTF8`, which is where the spec validation lives. + from_bytes( + py, + >::decode(r) + .map_err(|_| anyhow!("Cannot decode as bytes"))? + .0, + )? + .into() + } + major::ARRAY => { + let len: ffi::Py_ssize_t = types::Array::len(r)? + .ok_or_else(|| anyhow!("Array must contain length"))? + .try_into()?; + + unsafe { + let ptr = ffi::PyList_New(len); + + for i in 0..len { + ffi::PyList_SET_ITEM( + ptr, + i, + to_pyobject(py, r, depth + 1, max_depth)?.into_ptr(), + ); + } + + let list: Bound<'_, PyList> = Bound::from_owned_ptr(py, ptr).cast_into_unchecked(); + list.into_pyobject(py)?.into() + } + } + major::MAP => { + let len = types::Map::len(r)?.ok_or_else(|| anyhow!("Map must contain length"))?; + // Length is known up front; presize to avoid rehashes as we fill. + let dict = unsafe { + let ptr = new_presized(len); + if ptr.is_null() { + return Err(anyhow!(PyErr::fetch(py))); + } + Bound::from_owned_ptr(py, ptr).cast_into_unchecked::() + }; + + let mut prev_key: Option<&[u8]> = None; + for _ in 0..len { + // DAG-CBOR keys are always strings. Python does the UTF-8 validation when creating + // the string. + let key = >::decode(r) + .map_err(|_| anyhow!("Map keys must be strings"))? + .0; + + if let Some(prev_key) = prev_key { + // it cares about duplicated keys too thanks to Ordering::Equal + if map_key_cmp(prev_key, key) != std::cmp::Ordering::Less { + return Err(anyhow!("Map keys must be sorted and unique")); + } + } + + prev_key = Some(key); + + let (key_ptr, key_hash) = unsafe { intern(py, key)? }; + let key_bound: Bound<'_, PyAny> = unsafe { Bound::from_owned_ptr(py, key_ptr) }; + + let value_py = to_pyobject(py, r, depth + 1, max_depth)?; + + #[cfg(CPython)] + unsafe { + set_item_known_hash(py, &dict, &key_bound, value_py, key_hash)?; + } + #[cfg(not(CPython))] + { + let _ = key_hash; + dict.set_item(&key_bound, value_py)?; + } + } + + dict.into_pyobject(py)?.into() + } + major::TAG => { + let value = types::Tag::tag(r)?; + if value != 42 { + return Err(anyhow!("Non-42 tags are not supported")); + } + + let cid = >::decode(r)?.0; + + // we expect CIDs to have a leading zero byte + if cid.len() <= 1 || cid[0] != 0 { + return Err(anyhow!("Invalid CID")); + } + + let cid_without_prefix = &cid[1..]; + if ::cid::Cid::try_from(cid_without_prefix).is_err() { + return Err(anyhow!("Invalid CID")); + } + + PyBytes::new(py, cid_without_prefix) + .into_pyobject(py)? + .into() + } + major::SIMPLE => match byte { + // FIXME(MarshalX): should be more clear for bool? + marker::FALSE => { + r.advance(1); + false.into_pyobject(py)?.into_any().unbind() + } + marker::TRUE => { + r.advance(1); + true.into_pyobject(py)?.into_any().unbind() + } + marker::NULL => { + r.advance(1); + py.None() + } + marker::F32 => { + let value = f32::decode(r)?; + if !value.is_finite() { + return Err(anyhow!( + "Number out of range for f32 (NaNs are forbidden)".to_string() + )); + } + value.into_pyobject(py)?.into() + } + marker::F64 => { + let value = f64::decode(r)?; + if !value.is_finite() { + return Err(anyhow!( + "Number out of range for f64 (NaNs are forbidden)".to_string() + )); + } + value.into_pyobject(py)?.into() + } + _ => return Err(anyhow!("Unsupported major type".to_string())), + }, + _ => return Err(anyhow!("Invalid major type".to_string())), + }) +} + +#[pyfunction] +pub fn decode_dag_cbor_multi<'py>(py: Python<'py>, data: &[u8]) -> PyResult> { + let mut reader = SliceReader::new(data); + let decoded_parts = PyList::empty(py); + let max_depth = current_recursion_limit(); + + loop { + let py_object = to_pyobject(py, &mut reader, 0, max_depth); + if let Ok(py_object) = py_object { + decoded_parts.append(py_object)?; + } else { + break; + } + } + + Ok(decoded_parts) +} + +#[pyfunction] +pub fn decode_dag_cbor(py: Python, data: &[u8]) -> PyResult> { + let mut reader = SliceReader::new(data); + let max_depth = current_recursion_limit(); + let py_object = to_pyobject(py, &mut reader, 0, max_depth); + if let Ok(py_object) = py_object { + // check for any remaining data in the reader + if reader.fill(1)?.as_ref().is_empty() { + Ok(py_object) + } else { + Err(value_error( + "Failed to decode DAG-CBOR", + "Invalid DAG-CBOR: contains multiple objects (CBOR sequence)".to_string(), + )) + } + } else { + let err = value_error( + "Failed to decode DAG-CBOR", + py_object.unwrap_err().to_string(), + ); + + if let Some(py_err) = PyErr::take(py) { + py_err.set_cause(py, Option::from(err)); + // in case something set global interpreter’s error, + // for example C FFI function, we should return it + // the real case: RecursionError (set by Py_EnterRecursiveCall) + Err(py_err) + } else { + Err(err) + } + } +} diff --git a/src/dag_cbor/ser.rs b/src/dag_cbor/ser.rs new file mode 100644 index 0000000..280408d --- /dev/null +++ b/src/dag_cbor/ser.rs @@ -0,0 +1,218 @@ +use anyhow::{anyhow, Result}; +use cbor4ii::core::{ + enc::{self, Encode}, + types, +}; +use pyo3::pybacked::PyBackedStr; +use pyo3::{ffi, prelude::*, types::*}; + +use crate::cid::looks_like_cid; +use crate::error::value_error; +use crate::io::VecWriter; + +struct PrefixedCidBytes<'a>(&'a [u8]); + +impl<'a> Encode for PrefixedCidBytes<'a> { + fn encode(&self, w: &mut W) -> Result<(), enc::Error> { + // length prefix for bytes: 1 (leading 0) + payload + types::Bytes::bounded(1 + self.0.len(), w)?; + w.push(&[0x00])?; + w.push(self.0)?; + Ok(()) + } +} + +// One dict walk collects (key, value) pairs together; sorting by-index and +// re-fetching values through `map.values()` would materialize two extra +// PyLists and walk the dict three times. +fn sorted_map_entries<'py>( + map: &Bound<'py, PyDict>, +) -> Result)>> { + let len = map.len(); + let mut entries: Vec<(PyBackedStr, Bound<'py, PyAny>)> = Vec::with_capacity(len); + + for (key, value) in map.iter() { + let key_str = match key.cast_into::() { + Ok(k) => k, + Err(_) => return Err(anyhow!("Map keys must be strings")), + }; + let backed = PyBackedStr::try_from(key_str) + .map_err(|_| anyhow!("Failed to convert PyString to PyBackedStr"))?; + entries.push((backed, value)); + } + + if entries.len() >= 2 { + entries.sort_by(|a, b| { + // sort_unstable_by performs bad in past benchmarks; revisit if data shape changes. + let (s1, _) = a; + let (s2, _) = b; + if s1.len() != s2.len() { + s1.len().cmp(&s2.len()) + } else { + s1.as_bytes().cmp(s2.as_bytes()) + } + }); + } + + Ok(entries) +} + +#[inline] +fn encode_int(obj: &Bound<'_, PyAny>, w: &mut W) -> Result<()> +where + W::Error: Send + Sync, +{ + #[cfg(all(CPython, Py_3_12))] + { + if let Some((abs_val, neg)) = unsafe { crate::ffi::int::pylong_parts(obj.as_ptr()) } { + if neg { + types::Negative(abs_val - 1).encode(w)?; + } else { + abs_val.encode(w)?; + } + return Ok(()); + } + } + + let i: i128 = obj.extract()?; + if i.is_negative() { + if -(i + 1) > u64::MAX as i128 { + return Err(anyhow!("Number out of range")); + } + types::Negative(-(i + 1) as u64).encode(w)?; + } else { + if i > u64::MAX as i128 { + return Err(anyhow!("Number out of range")); + } + (i as u64).encode(w)?; + } + Ok(()) +} + +fn from_pyobject<'py, W: enc::Write>( + _py: Python<'py>, + obj: &Bound<'py, PyAny>, + w: &mut W, +) -> Result<()> +where + W::Error: Send + Sync, +{ + // Exact-type pointer compare per branch avoids the MRO walk that + // `is_instance_of` / `cast` perform. Order tuned for typical ATProto + // record shapes; subclasses fall through to the slow path below. + let tp = unsafe { ffi::Py_TYPE(obj.as_ptr()) }; + unsafe { + if tp == &raw mut ffi::PyUnicode_Type { + let s = obj.cast_unchecked::(); + s.to_str()?.encode(w)?; + return Ok(()); + } + if tp == &raw mut ffi::PyDict_Type { + let map = obj.cast_unchecked::(); + let entries = sorted_map_entries(map)?; + types::Map::bounded(entries.len(), w)?; + for (key, value) in &entries { + (&**key).encode(w)?; + from_pyobject(_py, value, w)?; + } + return Ok(()); + } + if tp == &raw mut ffi::PyList_Type { + let l = obj.cast_unchecked::(); + let len = l.len(); + types::Array::bounded(len, w)?; + for i in 0..len { + let item = l.get_item_unchecked(i); + from_pyobject(_py, &item, w)?; + } + return Ok(()); + } + if tp == &raw mut ffi::PyLong_Type { + return encode_int(obj, w); + } + if tp == &raw mut ffi::PyBytes_Type { + let b = obj.cast_unchecked::(); + let bytes = b.as_bytes(); + if looks_like_cid(bytes) && ::cid::Cid::try_from(bytes).is_ok() { + // by providing custom encoding we avoid extra allocation + types::Tag(42, PrefixedCidBytes(bytes)).encode(w)?; + } else { + types::Bytes(bytes).encode(w)?; + } + return Ok(()); + } + if tp == &raw mut ffi::PyBool_Type { + (obj.as_ptr() == ffi::Py_True()).encode(w)?; + return Ok(()); + } + if obj.as_ptr() == ffi::Py_None() { + types::Null.encode(w)?; + return Ok(()); + } + if tp == &raw mut ffi::PyFloat_Type { + let f = obj.cast_unchecked::(); + let v = f.value(); + if !v.is_finite() { + return Err(anyhow!("Number out of range")); + } + v.encode(w)?; + return Ok(()); + } + } + + // Slow path: subclasses of supported types (rare in DAG-CBOR usage). + if obj.is_instance_of::() { + (obj.as_ptr() == unsafe { ffi::Py_True() }).encode(w)?; + Ok(()) + } else if obj.is_instance_of::() { + encode_int(obj, w) + } else if let Ok(l) = obj.cast::() { + let len = l.len(); + types::Array::bounded(len, w)?; + for i in 0..len { + let item = unsafe { l.get_item_unchecked(i) }; + from_pyobject(_py, &item, w)?; + } + Ok(()) + } else if let Ok(map) = obj.cast::() { + let entries = sorted_map_entries(map)?; + types::Map::bounded(entries.len(), w)?; + for (key, value) in &entries { + (&**key).encode(w)?; + from_pyobject(_py, value, w)?; + } + Ok(()) + } else if let Ok(s) = obj.cast::() { + s.to_str()?.encode(w)?; + Ok(()) + } else if let Ok(b) = obj.cast::() { + let bytes = b.as_bytes(); + if looks_like_cid(bytes) && ::cid::Cid::try_from(bytes).is_ok() { + types::Tag(42, PrefixedCidBytes(bytes)).encode(w)?; + } else { + types::Bytes(bytes).encode(w)?; + } + Ok(()) + } else if let Ok(f) = obj.cast::() { + let v = f.value(); + if !v.is_finite() { + return Err(anyhow!("Number out of range")); + } + v.encode(w)?; + Ok(()) + } else { + Err(anyhow!("Unknown tag")) + } +} + +#[pyfunction] +pub fn encode_dag_cbor<'py>( + py: Python<'py>, + data: &Bound<'py, PyAny>, +) -> PyResult> { + let mut buf = VecWriter::new(); + if let Err(e) = from_pyobject(py, data, &mut buf) { + return Err(value_error("Failed to encode DAG-CBOR", e.to_string())); + } + Ok(PyBytes::new(py, buf.as_slice())) +} diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..018c0b4 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,6 @@ +use pyo3::PyErr; + +/// Build a `ValueError` of the form `"{msg}. {detail}"`. +pub(crate) fn value_error(msg: &str, detail: String) -> PyErr { + PyErr::new::(format!("{}. {}", msg, detail)) +} diff --git a/src/ffi.rs b/src/ffi.rs new file mode 100644 index 0000000..215a924 --- /dev/null +++ b/src/ffi.rs @@ -0,0 +1,16 @@ +//! Unsafe CPython interop layer. +//! +//! Everything here is `#[cfg]`-gated against the interpreter (CPython vs other, +//! Python version, free-threaded vs GIL) and reaches into CPython internals or +//! object layouts that the public `pyo3` API does not expose. The domain +//! modules call into these fast paths; the danger stays quarantined here. + +pub(crate) mod dict; +pub(crate) mod int; +pub(crate) mod key_cache; +pub(crate) mod recursion; +pub(crate) mod string; + +// Private CPython symbols only resolve on a real CPython build. +#[cfg(CPython)] +pub(crate) mod sys; diff --git a/src/ffi/dict.rs b/src/ffi/dict.rs new file mode 100644 index 0000000..17c4242 --- /dev/null +++ b/src/ffi/dict.rs @@ -0,0 +1,48 @@ +use pyo3::ffi; + +#[cfg(CPython)] +use anyhow::{anyhow, Result}; +#[cfg(CPython)] +use pyo3::prelude::*; +#[cfg(CPython)] +use pyo3::types::PyDict; + +// Empty CPython dicts already have 8 slots, so presizing below that buys +// nothing and lets us stay on the public `PyDict_New` path. +#[inline] +pub(crate) unsafe fn new_presized(len: usize) -> *mut ffi::PyObject { + #[cfg(CPython)] + { + if len > 8 { + crate::ffi::sys::_PyDict_NewPresized(len as ffi::Py_ssize_t) + } else { + ffi::PyDict_New() + } + } + #[cfg(not(CPython))] + { + let _ = len; + ffi::PyDict_New() + } +} + +// Insert by a precomputed `Py_hash_t`, skipping the rehash inside +// `PyDict_SetItem`. Steals the caller's reference to `value`. +#[cfg(CPython)] +#[inline] +pub(crate) unsafe fn set_item_known_hash( + py: Python<'_>, + dict: &Bound<'_, PyDict>, + key: &Bound<'_, PyAny>, + value: Py, + hash: ffi::Py_hash_t, +) -> Result<()> { + let value_ptr = value.into_ptr(); + let rc = + crate::ffi::sys::_PyDict_SetItem_KnownHash(dict.as_ptr(), key.as_ptr(), value_ptr, hash); + ffi::Py_DECREF(value_ptr); + if rc != 0 { + return Err(anyhow!(PyErr::fetch(py))); + } + Ok(()) +} diff --git a/src/ffi/int.rs b/src/ffi/int.rs new file mode 100644 index 0000000..0d56003 --- /dev/null +++ b/src/ffi/int.rs @@ -0,0 +1,31 @@ +#[cfg(all(CPython, Py_3_12))] +use pyo3::ffi; + +// CPython 3.12+ PyLongObject layout: `PyObject_HEAD; uintptr_t lv_tag; digit ob_digit[]`. +// `lv_tag` packs the sign in the low 3 bits (0=positive, 1=zero, 2=negative) and the +// digit count in the upper bits. Default builds use 30-bit digits (uint32_t). +// +// Returns `(abs_val, neg)` for ints that fit in two digits, or `None` when the +// caller should fall back to the generic `i128` extraction path. +#[cfg(all(CPython, Py_3_12))] +#[inline] +pub(crate) unsafe fn pylong_parts(obj: *mut ffi::PyObject) -> Option<(u64, bool)> { + const NON_SIZE_BITS: u32 = 3; + const SIGN_MASK: usize = 3; + const SIGN_NEGATIVE: usize = 2; + const PYLONG_DIGIT_BITS: u32 = 30; + + let lv_tag_ptr = (obj as *const u8).add(std::mem::size_of::()) as *const usize; + let lv_tag = *lv_tag_ptr; + let ndigits = lv_tag >> NON_SIZE_BITS; + let neg = (lv_tag & SIGN_MASK) == SIGN_NEGATIVE; + + let ob_digit = lv_tag_ptr.add(1) as *const u32; + let abs_val: u64 = match ndigits { + 0 => return Some((0, false)), + 1 => *ob_digit as u64, + 2 => (*ob_digit as u64) | ((*ob_digit.add(1) as u64) << PYLONG_DIGIT_BITS), + _ => return None, + }; + Some((abs_val, neg)) +} diff --git a/src/ffi/key_cache.rs b/src/ffi/key_cache.rs new file mode 100644 index 0000000..235d717 --- /dev/null +++ b/src/ffi/key_cache.rs @@ -0,0 +1,122 @@ +//! Direct-mapped intern cache for short map keys. atproto-shape payloads +//! reuse a small vocabulary (`$type`, `did`, `cid`, `uri`, `text`, ...) per +//! record; caching the constructed `PyUnicode` + its `Py_hash_t` skips both +//! the rebuild and the rehash inside `PyDict_SetItem`. + +// Cached variant: CPython with the GIL (single-threaded access to the static). +#[cfg(all(CPython, not(Py_GIL_DISABLED)))] +mod cached { + use pyo3::{ffi, prelude::*}; + + use crate::ffi::string::from_bytes; + + const CAP: usize = 2048; + const MAX_KEY_LEN: usize = 64; + + struct Entry { + len: u16, + bytes: [u8; MAX_KEY_LEN], + obj: *mut ffi::PyObject, + hash: ffi::Py_hash_t, + } + + impl Entry { + const fn empty() -> Self { + Self { + len: 0, + bytes: [0; MAX_KEY_LEN], + obj: std::ptr::null_mut(), + hash: 0, + } + } + } + + static mut SLOTS: [Entry; CAP] = [const { Entry::empty() }; CAP]; + + #[inline] + fn fx_hash(bytes: &[u8]) -> usize { + const K: u64 = 0x517c_c1b7_2722_0a95; + let mut h: u64 = 0; + for &b in bytes { + h = (h.rotate_left(5) ^ b as u64).wrapping_mul(K); + } + h as usize + } + + /// Returns `(strong-ref PyUnicode*, Py_hash_t)`. Caller owns one ref. + /// Caller must hold the GIL (we are always called from a `Python<'_>`). + #[inline] + pub(crate) unsafe fn intern( + py: Python<'_>, + bytes: &[u8], + ) -> PyResult<(*mut ffi::PyObject, ffi::Py_hash_t)> { + if bytes.len() > MAX_KEY_LEN { + return build(py, bytes); + } + + let slot_idx = fx_hash(bytes) & (CAP - 1); + // `&raw mut` is the supported path to a `static mut`; the explicit + // re-borrow keeps the field accesses readable. Clippy's `deref_addrof` + // suggestion would re-introduce `static_mut_refs`. + #[allow(clippy::deref_addrof)] + let slot = &mut *(&raw mut SLOTS[slot_idx]); + + if slot.len as usize == bytes.len() + && !slot.obj.is_null() + && slot.bytes[..bytes.len()] == *bytes + { + ffi::Py_INCREF(slot.obj); + return Ok((slot.obj, slot.hash)); + } + + let (obj, hash) = build(py, bytes)?; + // Evict the previous occupant before claiming the slot. + if !slot.obj.is_null() { + ffi::Py_DECREF(slot.obj); + } + // One ref for the cache, one for the caller. + ffi::Py_INCREF(obj); + slot.obj = obj; + slot.hash = hash; + slot.len = bytes.len() as u16; + slot.bytes[..bytes.len()].copy_from_slice(bytes); + Ok((obj, hash)) + } + + #[inline] + unsafe fn build( + py: Python<'_>, + bytes: &[u8], + ) -> PyResult<(*mut ffi::PyObject, ffi::Py_hash_t)> { + let s = from_bytes(py, bytes)?; + let ptr = s.as_ptr(); + let hash = ffi::PyObject_Hash(ptr); + if hash == -1 { + return Err(PyErr::fetch(py)); + } + Ok((s.into_ptr(), hash)) + } +} + +#[cfg(all(CPython, not(Py_GIL_DISABLED)))] +pub(crate) use cached::intern; + +// Non-CPython / free-threaded fallback: no cache, just build the string and +// compute its hash inline. +#[cfg(not(all(CPython, not(Py_GIL_DISABLED))))] +pub(crate) unsafe fn intern( + py: pyo3::Python<'_>, + bytes: &[u8], +) -> pyo3::PyResult<(*mut pyo3::ffi::PyObject, pyo3::ffi::Py_hash_t)> { + use pyo3::{ffi, prelude::*}; + + use crate::ffi::string::from_bytes; + + let s = from_bytes(py, bytes)?; + let ptr = s.as_ptr(); + let hash = ffi::PyObject_Hash(ptr); + if hash == -1 { + return Err(PyErr::fetch(py)); + } + Ok((s.into_ptr(), hash)) +} diff --git a/src/ffi/recursion.rs b/src/ffi/recursion.rs new file mode 100644 index 0000000..e9505c4 --- /dev/null +++ b/src/ffi/recursion.rs @@ -0,0 +1,10 @@ +use pyo3::ffi; + +// Snapshot `sys.getrecursionlimit()` once per top-level decode call and pass +// it through. Calling `ffi::Py_GetRecursionLimit()` from the hot path costs +// ~5–10 ns per recursive step, which dominates on scalar-dense payloads +// (canada makes 111k+ recursive calls, one per float). +#[inline] +pub(crate) fn current_recursion_limit() -> usize { + unsafe { ffi::Py_GetRecursionLimit() as usize } +} diff --git a/src/ffi/string.rs b/src/ffi/string.rs new file mode 100644 index 0000000..c5d9ac7 --- /dev/null +++ b/src/ffi/string.rs @@ -0,0 +1,35 @@ +use pyo3::prelude::*; +use pyo3::types::PyString; + +#[cfg(CPython)] +use pyo3::ffi; + +// `PyUnicode_DecodeUTF8` runs a state machine even on pure-ASCII input. Skip +// it by allocating a compact-ASCII `PyUnicode` and memcpying into its inline +// buffer; non-ASCII falls through to the standard decoder. +#[cfg(CPython)] +#[inline] +pub(crate) fn from_bytes<'py>(py: Python<'py>, bytes: &[u8]) -> PyResult> { + if !bytes.is_ascii() { + return PyString::from_bytes(py, bytes); + } + + unsafe { + let obj = ffi::PyUnicode_New(bytes.len() as ffi::Py_ssize_t, 127); + if obj.is_null() { + return Err(PyErr::fetch(py)); + } + + let data = obj.cast::().offset(1).cast::(); + std::ptr::copy_nonoverlapping(bytes.as_ptr(), data, bytes.len()); + *data.add(bytes.len()) = 0; + + Ok(Bound::from_owned_ptr(py, obj).cast_into_unchecked::()) + } +} + +#[cfg(not(CPython))] +#[inline] +pub(crate) fn from_bytes<'py>(py: Python<'py>, bytes: &[u8]) -> PyResult> { + PyString::from_bytes(py, bytes) +} diff --git a/src/ffi/sys.rs b/src/ffi/sys.rs new file mode 100644 index 0000000..6ca7488 --- /dev/null +++ b/src/ffi/sys.rs @@ -0,0 +1,13 @@ +//! Private CPython symbols; not provided by pyo3-ffi and CPython-only. + +use pyo3::ffi; + +extern "C" { + pub(crate) fn _PyDict_NewPresized(minused: ffi::Py_ssize_t) -> *mut ffi::PyObject; + pub(crate) fn _PyDict_SetItem_KnownHash( + op: *mut ffi::PyObject, + key: *mut ffi::PyObject, + value: *mut ffi::PyObject, + hash: ffi::Py_hash_t, + ) -> std::os::raw::c_int; +} diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 0000000..b8f02d8 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,9 @@ +//! IO primitives shared by the codecs: an in-memory reader, a `Vec`-backed +//! writer, and the LEB128 varint reader used by the CAR container format. + +pub(crate) mod leb128; +pub(crate) mod reader; +pub(crate) mod writer; + +pub(crate) use reader::{peek_one, SliceReader}; +pub(crate) use writer::VecWriter; diff --git a/src/io/leb128.rs b/src/io/leb128.rs new file mode 100644 index 0000000..aa5f8f3 --- /dev/null +++ b/src/io/leb128.rs @@ -0,0 +1,40 @@ +use anyhow::{anyhow, Result}; +use cbor4ii::core::dec; + +use crate::io::reader::peek_one; + +#[inline] +pub(crate) fn read_u64<'de, R: dec::Read<'de>>(r: &mut R) -> Result +where + R::Error: Send + Sync, +{ + let mut result: u64 = 0; + let mut shift = 0; + + loop { + let byte = + peek_one(r).map_err(|_| anyhow!("Unexpected EOF while reading ULEB128 number."))?; + r.advance(1); + + if shift == 63 && byte != 0x00 && byte != 0x01 { + // consume remaining continuation bytes so reader stays in sync + let mut b = byte; + while b & 0x80 != 0 { + b = peek_one(r).map_err(|_| { + anyhow!("Unexpected EOF while skipping overflowing ULEB128 number.") + })?; + r.advance(1); + } + return Err(anyhow!("ULEB128 overflow")); + } + + let low_bits = (byte & !0x80) as u64; + result |= low_bits << shift; + + if byte & 0x80 == 0 { + return Ok(result); + } + + shift += 7; + } +} diff --git a/src/io/reader.rs b/src/io/reader.rs new file mode 100644 index 0000000..f16d6e2 --- /dev/null +++ b/src/io/reader.rs @@ -0,0 +1,42 @@ +use anyhow::{anyhow, Result}; +use cbor4ii::core::dec; + +// Based on cbor4ii/src/utils.rs. +/// An in-memory reader. +pub(crate) struct SliceReader<'a> { + pub(crate) buf: &'a [u8], +} + +impl SliceReader<'_> { + pub(crate) fn new(buf: &[u8]) -> SliceReader<'_> { + SliceReader { buf } + } +} + +impl<'de> dec::Read<'de> for SliceReader<'de> { + type Error = core::convert::Infallible; + + #[inline] + fn fill<'b>(&'b mut self, want: usize) -> Result, Self::Error> { + let len = core::cmp::min(self.buf.len(), want); + Ok(dec::Reference::Long(&self.buf[..len])) + } + + #[inline] + fn advance(&mut self, n: usize) { + let len = core::cmp::min(self.buf.len(), n); + self.buf = &self.buf[len..]; + } +} + +// Based on cbor4ii code. +pub(crate) fn peek_one<'de, R: dec::Read<'de>>(r: &mut R) -> Result +where + R::Error: Send + Sync, +{ + r.fill(1)? + .as_ref() + .first() + .copied() + .ok_or_else(|| anyhow!("end of data")) +} diff --git a/src/io/writer.rs b/src/io/writer.rs new file mode 100644 index 0000000..ea79670 --- /dev/null +++ b/src/io/writer.rs @@ -0,0 +1,29 @@ +use std::convert::Infallible; + +use cbor4ii::core::enc; + +// `enc::Write` over a raw `Vec`: no syscalls behind it, so a `BufWriter` +// wrapper would just add a memcpy per push for no benefit. +pub(crate) struct VecWriter(Vec); + +impl VecWriter { + #[inline] + pub(crate) fn new() -> Self { + VecWriter(Vec::new()) + } + + #[inline] + pub(crate) fn as_slice(&self) -> &[u8] { + &self.0 + } +} + +impl enc::Write for VecWriter { + type Error = Infallible; + + #[inline] + fn push(&mut self, input: &[u8]) -> Result<(), Self::Error> { + self.0.extend_from_slice(input); + Ok(()) + } +} diff --git a/src/lib.rs b/src/lib.rs index ce2c003..96370b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,991 +1,28 @@ -use std::convert::Infallible; +use pyo3::prelude::*; -use anyhow::{anyhow, Result}; -use cbor4ii::core::{ - dec::{self, Decode, Read}, - enc::{self, Encode}, - major, marker, types, -}; -use cid::{multibase, Cid}; -use pyo3::pybacked::PyBackedStr; -use pyo3::{ffi, prelude::*, types::*, BoundObject, Python}; - -// Private CPython symbols; not provided by pyo3-ffi and CPython-only. -#[cfg(CPython)] -extern "C" { - fn _PyDict_NewPresized(minused: ffi::Py_ssize_t) -> *mut ffi::PyObject; - fn _PyDict_SetItem_KnownHash( - op: *mut ffi::PyObject, - key: *mut ffi::PyObject, - value: *mut ffi::PyObject, - hash: ffi::Py_hash_t, - ) -> std::os::raw::c_int; -} - -// Empty CPython dicts already have 8 slots, so presizing below that buys -// nothing and lets us stay on the public `PyDict_New` path. -#[inline] -unsafe fn new_presized_dict(len: usize) -> *mut ffi::PyObject { - #[cfg(CPython)] - { - if len > 8 { - _PyDict_NewPresized(len as ffi::Py_ssize_t) - } else { - ffi::PyDict_New() - } - } - #[cfg(not(CPython))] - { - let _ = len; - ffi::PyDict_New() - } -} - -// `enc::Write` over a raw `Vec`: no syscalls behind it, so a `BufWriter` -// wrapper would just add a memcpy per push for no benefit. -struct VecWriter(Vec); - -impl VecWriter { - #[inline] - fn new() -> Self { - VecWriter(Vec::new()) - } - - #[inline] - fn as_slice(&self) -> &[u8] { - &self.0 - } -} - -impl enc::Write for VecWriter { - type Error = Infallible; - - #[inline] - fn push(&mut self, input: &[u8]) -> Result<(), Self::Error> { - self.0.extend_from_slice(input); - Ok(()) - } -} - -// Based on cbor4ii/src/utils.rs. -/// An in-memory reader. -struct SliceReader<'a> { - buf: &'a [u8], -} - -impl SliceReader<'_> { - fn new(buf: &[u8]) -> SliceReader<'_> { - SliceReader { buf } - } -} - -impl<'de> dec::Read<'de> for SliceReader<'de> { - type Error = core::convert::Infallible; - - #[inline] - fn fill<'b>(&'b mut self, want: usize) -> Result, Self::Error> { - let len = core::cmp::min(self.buf.len(), want); - Ok(dec::Reference::Long(&self.buf[..len])) - } - - #[inline] - fn advance(&mut self, n: usize) { - let len = core::cmp::min(self.buf.len(), n); - self.buf = &self.buf[len..]; - } -} - -struct PrefixedCidBytes<'a>(&'a [u8]); - -impl<'a> Encode for PrefixedCidBytes<'a> { - fn encode(&self, w: &mut W) -> Result<(), enc::Error> { - // length prefix for bytes: 1 (leading 0) + payload - types::Bytes::bounded(1 + self.0.len(), w)?; - w.push(&[0x00])?; - w.push(self.0)?; - Ok(()) - } -} - -fn cid_hash_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> PyResult> { - let hash = cid.hash(); - let dict_obj = PyDict::new(py); - - dict_obj.set_item("code", hash.code())?; - dict_obj.set_item("size", hash.size())?; - dict_obj.set_item("digest", PyBytes::new(py, hash.digest()))?; - - Ok(dict_obj) -} - -fn cid_to_pydict<'py>(py: Python<'py>, cid: &Cid) -> PyResult> { - let dict_obj = PyDict::new(py); - - dict_obj.set_item("version", cid.version() as u64)?; - dict_obj.set_item("codec", cid.codec())?; - dict_obj.set_item("hash", cid_hash_to_pydict(py, cid)?)?; - Ok(dict_obj) -} - -fn map_key_cmp(a: &[u8], b: &[u8]) -> std::cmp::Ordering { - /* The keys in every map must be sorted length-first by the byte representation of the string keys, where: - - If two keys have different lengths, the shorter one sorts earlier; - - If two keys have the same length, the one with the lower value in (byte-wise) lexical order sorts earlier. - */ - if a.len() != b.len() { - a.len().cmp(&b.len()) - } else { - a.cmp(b) - } -} - -// One dict walk collects (key, value) pairs together; sorting by-index and -// re-fetching values through `map.values()` would materialize two extra -// PyLists and walk the dict three times. -fn collect_and_sort_map_entries<'py>( - map: &Bound<'py, PyDict>, -) -> Result)>> { - let len = map.len(); - let mut entries: Vec<(PyBackedStr, Bound<'py, PyAny>)> = Vec::with_capacity(len); - - for (key, value) in map.iter() { - let key_str = match key.cast_into::() { - Ok(k) => k, - Err(_) => return Err(anyhow!("Map keys must be strings")), - }; - let backed = PyBackedStr::try_from(key_str) - .map_err(|_| anyhow!("Failed to convert PyString to PyBackedStr"))?; - entries.push((backed, value)); - } - - if entries.len() >= 2 { - entries.sort_by(|a, b| { - // sort_unstable_by performs bad in past benchmarks; revisit if data shape changes. - let (s1, _) = a; - let (s2, _) = b; - if s1.len() != s2.len() { - s1.len().cmp(&s2.len()) - } else { - s1.as_bytes().cmp(s2.as_bytes()) - } - }); - } - - Ok(entries) -} - -// `PyUnicode_DecodeUTF8` runs a state machine even on pure-ASCII input. Skip -// it by allocating a compact-ASCII `PyUnicode` and memcpying into its inline -// buffer; non-ASCII falls through to the standard decoder. -#[cfg(CPython)] -#[inline] -fn pystring_from_bytes_fast<'py>(py: Python<'py>, bytes: &[u8]) -> PyResult> { - if !bytes.is_ascii() { - return PyString::from_bytes(py, bytes); - } - - unsafe { - let obj = ffi::PyUnicode_New(bytes.len() as ffi::Py_ssize_t, 127); - if obj.is_null() { - return Err(PyErr::fetch(py)); - } - - let data = obj.cast::().offset(1).cast::(); - std::ptr::copy_nonoverlapping(bytes.as_ptr(), data, bytes.len()); - *data.add(bytes.len()) = 0; - - Ok(Bound::from_owned_ptr(py, obj).cast_into_unchecked::()) - } -} - -#[cfg(not(CPython))] -#[inline] -fn pystring_from_bytes_fast<'py>(py: Python<'py>, bytes: &[u8]) -> PyResult> { - PyString::from_bytes(py, bytes) -} - -// Direct-mapped intern cache for short map keys. atproto-shape payloads -// reuse a small vocabulary (`$type`, `did`, `cid`, `uri`, `text`, ...) per -// record; caching the constructed `PyUnicode` + its `Py_hash_t` skips both -// the rebuild and the rehash inside `PyDict_SetItem` -#[cfg(all(CPython, not(Py_GIL_DISABLED)))] -mod key_cache { - use super::pystring_from_bytes_fast; - use pyo3::{ffi, prelude::*}; - - const CAP: usize = 2048; - const MAX_KEY_LEN: usize = 64; - - struct Entry { - len: u16, - bytes: [u8; MAX_KEY_LEN], - obj: *mut ffi::PyObject, - hash: ffi::Py_hash_t, - } - - impl Entry { - const fn empty() -> Self { - Self { - len: 0, - bytes: [0; MAX_KEY_LEN], - obj: std::ptr::null_mut(), - hash: 0, - } - } - } - - static mut SLOTS: [Entry; CAP] = [const { Entry::empty() }; CAP]; - - #[inline] - fn fx_hash(bytes: &[u8]) -> usize { - const K: u64 = 0x517c_c1b7_2722_0a95; - let mut h: u64 = 0; - for &b in bytes { - h = (h.rotate_left(5) ^ b as u64).wrapping_mul(K); - } - h as usize - } - - /// Returns `(strong-ref PyUnicode*, Py_hash_t)`. Caller owns one ref. - /// Caller must hold the GIL (we are always called from a `Python<'_>`). - #[inline] - pub(super) unsafe fn intern_key( - py: Python<'_>, - bytes: &[u8], - ) -> PyResult<(*mut ffi::PyObject, ffi::Py_hash_t)> { - if bytes.len() > MAX_KEY_LEN { - return build(py, bytes); - } - - let slot_idx = fx_hash(bytes) & (CAP - 1); - // `&raw mut` is the supported path to a `static mut`; the explicit - // re-borrow keeps the field accesses readable. Clippy's `deref_addrof` - // suggestion would re-introduce `static_mut_refs`. - #[allow(clippy::deref_addrof)] - let slot = &mut *(&raw mut SLOTS[slot_idx]); - - if slot.len as usize == bytes.len() - && !slot.obj.is_null() - && slot.bytes[..bytes.len()] == *bytes - { - ffi::Py_INCREF(slot.obj); - return Ok((slot.obj, slot.hash)); - } - - let (obj, hash) = build(py, bytes)?; - // Evict the previous occupant before claiming the slot. - if !slot.obj.is_null() { - ffi::Py_DECREF(slot.obj); - } - // One ref for the cache, one for the caller. - ffi::Py_INCREF(obj); - slot.obj = obj; - slot.hash = hash; - slot.len = bytes.len() as u16; - slot.bytes[..bytes.len()].copy_from_slice(bytes); - Ok((obj, hash)) - } - - #[inline] - unsafe fn build( - py: Python<'_>, - bytes: &[u8], - ) -> PyResult<(*mut ffi::PyObject, ffi::Py_hash_t)> { - let s = pystring_from_bytes_fast(py, bytes)?; - let ptr = s.as_ptr(); - let hash = ffi::PyObject_Hash(ptr); - if hash == -1 { - return Err(PyErr::fetch(py)); - } - Ok((s.into_ptr(), hash)) - } -} - -// Non-CPython / free-threaded fallback: no cache, just build the string and compute its hash inline -#[cfg(not(all(CPython, not(Py_GIL_DISABLED))))] -mod key_cache { - use super::pystring_from_bytes_fast; - use pyo3::{ffi, prelude::*}; - - #[inline] - pub(super) unsafe fn intern_key( - py: Python<'_>, - bytes: &[u8], - ) -> PyResult<(*mut ffi::PyObject, ffi::Py_hash_t)> { - let s = pystring_from_bytes_fast(py, bytes)?; - let ptr = s.as_ptr(); - let hash = ffi::PyObject_Hash(ptr); - if hash == -1 { - return Err(PyErr::fetch(py)); - } - Ok((s.into_ptr(), hash)) - } -} - -fn get_bytes_from_py_any<'py>(obj: &'py Bound<'py, PyAny>) -> PyResult<&'py [u8]> { - if let Ok(b) = obj.cast::() { - Ok(b.as_bytes()) - } else if let Ok(ba) = obj.cast::() { - Ok(unsafe { ba.as_bytes() }) - } else if let Ok(s) = obj.cast::() { - Ok(s.to_str()?.as_bytes()) - } else { - Err(get_err( - "Failed to encode multibase", - "Unsupported data type".to_string(), - )) - } -} - -// Based on cbor4ii code. -fn peek_one<'de, R: dec::Read<'de>>(r: &mut R) -> Result -where - R::Error: Send + Sync, -{ - r.fill(1)? - .as_ref() - .first() - .copied() - .ok_or_else(|| anyhow!("end of data")) -} - -// Snapshot `sys.getrecursionlimit()` once per top-level decode call and pass -// it through. Calling `ffi::Py_GetRecursionLimit()` from the hot path costs -// ~5–10 ns per recursive step, which dominates on scalar-dense payloads -// (canada makes 111k+ recursive calls, one per float). -#[inline] -fn current_recursion_limit() -> usize { - unsafe { ffi::Py_GetRecursionLimit() as usize } -} - -fn decode_dag_cbor_to_pyobject<'de, R: dec::Read<'de>>( - py: Python, - r: &mut R, - depth: usize, - max_depth: usize, -) -> Result> -where - R::Error: Send + Sync, -{ - if depth > max_depth { - PyErr::new::( - "RecursionError: maximum recursion depth exceeded in DAG-CBOR decoding", - ) - .restore(py); - - return Err(anyhow!("Maximum recursion depth exceeded")); - } - - let byte = peek_one(r)?; - Ok(match dec::if_major(byte) { - major::UNSIGNED => u64::decode(r)?.into_pyobject(py)?.into(), - major::NEGATIVE => i128::decode(r)?.into_pyobject(py)?.into(), - major::BYTES => PyBytes::new(py, >::decode(r)?.0) - .into_pyobject(py)? - .into(), - major::STRING => { - // ASCII fast path inside the helper; non-ASCII falls through to - // `PyUnicode_DecodeUTF8`, which is where the spec validation lives. - pystring_from_bytes_fast( - py, - >::decode(r) - .map_err(|_| anyhow!("Cannot decode as bytes"))? - .0, - )? - .into() - } - major::ARRAY => { - let len: ffi::Py_ssize_t = types::Array::len(r)? - .ok_or_else(|| anyhow!("Array must contain length"))? - .try_into()?; - - unsafe { - let ptr = ffi::PyList_New(len); - - for i in 0..len { - ffi::PyList_SET_ITEM( - ptr, - i, - decode_dag_cbor_to_pyobject(py, r, depth + 1, max_depth)?.into_ptr(), - ); - } - - let list: Bound<'_, PyList> = Bound::from_owned_ptr(py, ptr).cast_into_unchecked(); - list.into_pyobject(py)?.into() - } - } - major::MAP => { - let len = types::Map::len(r)?.ok_or_else(|| anyhow!("Map must contain length"))?; - // Length is known up front; presize to avoid rehashes as we fill. - let dict = unsafe { - let ptr = new_presized_dict(len); - if ptr.is_null() { - return Err(anyhow!(PyErr::fetch(py))); - } - Bound::from_owned_ptr(py, ptr).cast_into_unchecked::() - }; - - let mut prev_key: Option<&[u8]> = None; - for _ in 0..len { - // DAG-CBOR keys are always strings. Python does the UTF-8 validation when creating - // the string. - let key = >::decode(r) - .map_err(|_| anyhow!("Map keys must be strings"))? - .0; - - if let Some(prev_key) = prev_key { - // it cares about duplicated keys too thanks to Ordering::Equal - if map_key_cmp(prev_key, key) != std::cmp::Ordering::Less { - return Err(anyhow!("Map keys must be sorted and unique")); - } - } - - prev_key = Some(key); - - let (key_ptr, key_hash) = unsafe { key_cache::intern_key(py, key)? }; - let key_bound: Bound<'_, PyAny> = unsafe { Bound::from_owned_ptr(py, key_ptr) }; - - let value_py = decode_dag_cbor_to_pyobject(py, r, depth + 1, max_depth)?; - - #[cfg(CPython)] - unsafe { - let value_ptr = value_py.into_ptr(); - let rc = _PyDict_SetItem_KnownHash( - dict.as_ptr(), - key_bound.as_ptr(), - value_ptr, - key_hash, - ); - ffi::Py_DECREF(value_ptr); - if rc != 0 { - return Err(anyhow!(PyErr::fetch(py))); - } - } - #[cfg(not(CPython))] - { - let _ = key_hash; - dict.set_item(&key_bound, value_py)?; - } - } - - dict.into_pyobject(py)?.into() - } - major::TAG => { - let value = types::Tag::tag(r)?; - if value != 42 { - return Err(anyhow!("Non-42 tags are not supported")); - } - - let cid = >::decode(r)?.0; - - // we expect CIDs to have a leading zero byte - if cid.len() <= 1 || cid[0] != 0 { - return Err(anyhow!("Invalid CID")); - } - - let cid_without_prefix = &cid[1..]; - if Cid::try_from(cid_without_prefix).is_err() { - return Err(anyhow!("Invalid CID")); - } - - PyBytes::new(py, cid_without_prefix) - .into_pyobject(py)? - .into() - } - major::SIMPLE => match byte { - // FIXME(MarshalX): should be more clear for bool? - marker::FALSE => { - r.advance(1); - false.into_pyobject(py)?.into_any().unbind() - } - marker::TRUE => { - r.advance(1); - true.into_pyobject(py)?.into_any().unbind() - } - marker::NULL => { - r.advance(1); - py.None() - } - marker::F32 => { - let value = f32::decode(r)?; - if !value.is_finite() { - return Err(anyhow!( - "Number out of range for f32 (NaNs are forbidden)".to_string() - )); - } - value.into_pyobject(py)?.into() - } - marker::F64 => { - let value = f64::decode(r)?; - if !value.is_finite() { - return Err(anyhow!( - "Number out of range for f64 (NaNs are forbidden)".to_string() - )); - } - value.into_pyobject(py)?.into() - } - _ => return Err(anyhow!("Unsupported major type".to_string())), - }, - _ => return Err(anyhow!("Invalid major type".to_string())), - }) -} - -// `Cid::try_from` parses two varints + a multihash on every call; this O(1) -// shape check rejects payloads that can't be a CID without paying for it. -// CIDv1 starts with `0x01`; CIDv0 is exactly 34 bytes starting `0x12 0x20`. -#[inline] -fn looks_like_cid(bytes: &[u8]) -> bool { - if bytes.len() < 4 { - return false; - } - if bytes[0] == 0x01 { - return true; - } - bytes.len() == 34 && bytes[0] == 0x12 && bytes[1] == 0x20 -} - -fn encode_dag_cbor_from_pyobject<'py, W: enc::Write>( - _py: Python<'py>, - obj: &Bound<'py, PyAny>, - w: &mut W, -) -> Result<()> -where - W::Error: Send + Sync, -{ - // Exact-type pointer compare per branch avoids the MRO walk that - // `is_instance_of` / `cast` perform. Order tuned for typical ATProto - // record shapes; subclasses fall through to the slow path below. - let tp = unsafe { ffi::Py_TYPE(obj.as_ptr()) }; - unsafe { - if tp == &raw mut ffi::PyUnicode_Type { - let s = obj.cast_unchecked::(); - s.to_str()?.encode(w)?; - return Ok(()); - } - if tp == &raw mut ffi::PyDict_Type { - let map = obj.cast_unchecked::(); - let entries = collect_and_sort_map_entries(map)?; - types::Map::bounded(entries.len(), w)?; - for (key, value) in &entries { - (&**key).encode(w)?; - encode_dag_cbor_from_pyobject(_py, value, w)?; - } - return Ok(()); - } - if tp == &raw mut ffi::PyList_Type { - let l = obj.cast_unchecked::(); - let len = l.len(); - types::Array::bounded(len, w)?; - for i in 0..len { - let item = l.get_item_unchecked(i); - encode_dag_cbor_from_pyobject(_py, &item, w)?; - } - return Ok(()); - } - if tp == &raw mut ffi::PyLong_Type { - return encode_int(obj, w); - } - if tp == &raw mut ffi::PyBytes_Type { - let b = obj.cast_unchecked::(); - let bytes = b.as_bytes(); - if looks_like_cid(bytes) && Cid::try_from(bytes).is_ok() { - // by providing custom encoding we avoid extra allocation - types::Tag(42, PrefixedCidBytes(bytes)).encode(w)?; - } else { - types::Bytes(bytes).encode(w)?; - } - return Ok(()); - } - if tp == &raw mut ffi::PyBool_Type { - (obj.as_ptr() == ffi::Py_True()).encode(w)?; - return Ok(()); - } - if obj.as_ptr() == ffi::Py_None() { - types::Null.encode(w)?; - return Ok(()); - } - if tp == &raw mut ffi::PyFloat_Type { - let f = obj.cast_unchecked::(); - let v = f.value(); - if !v.is_finite() { - return Err(anyhow!("Number out of range")); - } - v.encode(w)?; - return Ok(()); - } - } - - // Slow path: subclasses of supported types (rare in DAG-CBOR usage). - if obj.is_instance_of::() { - (obj.as_ptr() == unsafe { ffi::Py_True() }).encode(w)?; - Ok(()) - } else if obj.is_instance_of::() { - encode_int(obj, w) - } else if let Ok(l) = obj.cast::() { - let len = l.len(); - types::Array::bounded(len, w)?; - for i in 0..len { - let item = unsafe { l.get_item_unchecked(i) }; - encode_dag_cbor_from_pyobject(_py, &item, w)?; - } - Ok(()) - } else if let Ok(map) = obj.cast::() { - let entries = collect_and_sort_map_entries(map)?; - types::Map::bounded(entries.len(), w)?; - for (key, value) in &entries { - (&**key).encode(w)?; - encode_dag_cbor_from_pyobject(_py, value, w)?; - } - Ok(()) - } else if let Ok(s) = obj.cast::() { - s.to_str()?.encode(w)?; - Ok(()) - } else if let Ok(b) = obj.cast::() { - let bytes = b.as_bytes(); - if looks_like_cid(bytes) && Cid::try_from(bytes).is_ok() { - types::Tag(42, PrefixedCidBytes(bytes)).encode(w)?; - } else { - types::Bytes(bytes).encode(w)?; - } - Ok(()) - } else if let Ok(f) = obj.cast::() { - let v = f.value(); - if !v.is_finite() { - return Err(anyhow!("Number out of range")); - } - v.encode(w)?; - Ok(()) - } else { - Err(anyhow!("Unknown tag")) - } -} - -// CPython 3.12+ PyLongObject layout: `PyObject_HEAD; uintptr_t lv_tag; digit ob_digit[]`. -// `lv_tag` packs the sign in the low 3 bits (0=positive, 1=zero, 2=negative) and the -// digit count in the upper bits. Default builds use 30-bit digits (uint32_t). -#[cfg(all(CPython, Py_3_12))] -#[inline] -unsafe fn pylong_to_dag_int_fast(obj: *mut ffi::PyObject) -> Option<(u64, bool)> { - const NON_SIZE_BITS: u32 = 3; - const SIGN_MASK: usize = 3; - const SIGN_NEGATIVE: usize = 2; - const PYLONG_DIGIT_BITS: u32 = 30; - - let lv_tag_ptr = (obj as *const u8).add(std::mem::size_of::()) as *const usize; - let lv_tag = *lv_tag_ptr; - let ndigits = lv_tag >> NON_SIZE_BITS; - let neg = (lv_tag & SIGN_MASK) == SIGN_NEGATIVE; - - let ob_digit = lv_tag_ptr.add(1) as *const u32; - let abs_val: u64 = match ndigits { - 0 => return Some((0, false)), - 1 => *ob_digit as u64, - 2 => (*ob_digit as u64) | ((*ob_digit.add(1) as u64) << PYLONG_DIGIT_BITS), - _ => return None, - }; - Some((abs_val, neg)) -} - -#[inline] -fn encode_int(obj: &Bound<'_, PyAny>, w: &mut W) -> Result<()> -where - W::Error: Send + Sync, -{ - #[cfg(all(CPython, Py_3_12))] - { - if let Some((abs_val, neg)) = unsafe { pylong_to_dag_int_fast(obj.as_ptr()) } { - if neg { - types::Negative(abs_val - 1).encode(w)?; - } else { - abs_val.encode(w)?; - } - return Ok(()); - } - } - - let i: i128 = obj.extract()?; - if i.is_negative() { - if -(i + 1) > u64::MAX as i128 { - return Err(anyhow!("Number out of range")); - } - types::Negative(-(i + 1) as u64).encode(w)?; - } else { - if i > u64::MAX as i128 { - return Err(anyhow!("Number out of range")); - } - (i as u64).encode(w)?; - } - Ok(()) -} - -#[pyfunction] -fn decode_dag_cbor_multi<'py>(py: Python<'py>, data: &[u8]) -> PyResult> { - let mut reader = SliceReader::new(data); - let decoded_parts = PyList::empty(py); - let max_depth = current_recursion_limit(); - - loop { - let py_object = decode_dag_cbor_to_pyobject(py, &mut reader, 0, max_depth); - if let Ok(py_object) = py_object { - decoded_parts.append(py_object)?; - } else { - break; - } - } - - Ok(decoded_parts) -} - -#[inline] -fn read_u64_leb128<'de, R: dec::Read<'de>>(r: &mut R) -> Result -where - R::Error: Send + Sync, -{ - let mut result: u64 = 0; - let mut shift = 0; - - loop { - let byte = - peek_one(r).map_err(|_| anyhow!("Unexpected EOF while reading ULEB128 number."))?; - r.advance(1); - - if shift == 63 && byte != 0x00 && byte != 0x01 { - // consume remaining continuation bytes so reader stays in sync - let mut b = byte; - while b & 0x80 != 0 { - b = peek_one(r).map_err(|_| { - anyhow!("Unexpected EOF while skipping overflowing ULEB128 number.") - })?; - r.advance(1); - } - return Err(anyhow!("ULEB128 overflow")); - } - - let low_bits = (byte & !0x80) as u64; - result |= low_bits << shift; - - if byte & 0x80 == 0 { - return Ok(result); - } - - shift += 7; - } -} - -#[pyfunction] -pub fn decode_car<'py>(py: Python<'py>, data: &[u8]) -> PyResult<(Py, Bound<'py, PyDict>)> { - let buf = &mut SliceReader::new(data); - let max_depth = current_recursion_limit(); - - if read_u64_leb128(buf).is_err() { - return Err(get_err( - "Failed to read CAR header", - "Invalid uvarint".to_string(), - )); - } - let Ok(header_obj) = decode_dag_cbor_to_pyobject(py, buf, 0, max_depth) else { - return Err(get_err( - "Failed to read CAR header", - "Invalid DAG-CBOR".to_string(), - )); - }; - - let header = header_obj.cast_bound::(py)?; - - let Some(version) = header.get_item("version")? else { - return Err(get_err( - "Failed to read CAR header", - "Version is None".to_string(), - )); - }; - if version.cast::()?.extract::()? != 1 { - return Err(get_err( - "Failed to read CAR header", - "Unsupported version. Version must be 1".to_string(), - )); - } - - let Some(roots) = header.get_item("roots")? else { - return Err(get_err( - "Failed to read CAR header", - "Roots is None".to_string(), - )); - }; - if roots.cast::()?.len() == 0 { - return Err(get_err( - "Failed to read CAR header", - "Roots is empty. Must be at least one".to_string(), - )); - } - - // FIXME (MarshalX): we are not verifying if the roots are valid CIDs - - let parsed_blocks = PyDict::new(py); - - loop { - if read_u64_leb128(buf).is_err() { - // FIXME (MarshalX): we are not raising an error here because of possible EOF - break; - } - - let cid_bytes_before = buf.buf; - // `&[u8]` is itself an `io::Read`, so we hand it to `Cid::read_bytes` - // directly and recover the consumed length from the slice shrink. - let mut slice: &[u8] = cid_bytes_before; - let cid_result = Cid::read_bytes(&mut slice); - let Ok(cid) = cid_result else { - return Err(get_err( - "Failed to read CID of block", - cid_result.unwrap_err().to_string(), - )); - }; - - if cid.codec() != 0x71 { - return Err(get_err( - "Failed to read CAR block", - "Unsupported codec. For now we support only DAG-CBOR (0x71)".to_string(), - )); - } - - let consumed = cid_bytes_before.len() - slice.len(); - buf.advance(consumed); - let cid_raw = &cid_bytes_before[..consumed]; - - let block_result = decode_dag_cbor_to_pyobject(py, buf, 0, max_depth); - let Ok(block) = block_result else { - return Err(get_err( - "Failed to read CAR block", - block_result.unwrap_err().to_string(), - )); - }; - - let key = PyBytes::new(py, cid_raw).into_pyobject(py)?; - parsed_blocks.set_item(key, block)?; - } - - Ok((header_obj, parsed_blocks)) -} - -#[pyfunction] -pub fn decode_dag_cbor(py: Python, data: &[u8]) -> PyResult> { - let mut reader = SliceReader::new(data); - let max_depth = current_recursion_limit(); - let py_object = decode_dag_cbor_to_pyobject(py, &mut reader, 0, max_depth); - if let Ok(py_object) = py_object { - // check for any remaining data in the reader - if reader.fill(1)?.as_ref().is_empty() { - Ok(py_object) - } else { - Err(get_err( - "Failed to decode DAG-CBOR", - "Invalid DAG-CBOR: contains multiple objects (CBOR sequence)".to_string(), - )) - } - } else { - let err = get_err( - "Failed to decode DAG-CBOR", - py_object.unwrap_err().to_string(), - ); - - if let Some(py_err) = PyErr::take(py) { - py_err.set_cause(py, Option::from(err)); - // in case something set global interpreter’s error, - // for example C FFI function, we should return it - // the real case: RecursionError (set by Py_EnterRecursiveCall) - Err(py_err) - } else { - Err(err) - } - } -} - -#[pyfunction] -pub fn encode_dag_cbor<'py>( - py: Python<'py>, - data: &Bound<'py, PyAny>, -) -> PyResult> { - let mut buf = VecWriter::new(); - if let Err(e) = encode_dag_cbor_from_pyobject(py, data, &mut buf) { - return Err(get_err("Failed to encode DAG-CBOR", e.to_string())); - } - Ok(PyBytes::new(py, buf.as_slice())) -} - -fn get_cid_from_py_any(data: &Bound) -> PyResult { - let cid = if let Ok(s) = data.cast::() { - Cid::try_from(s.to_str()?) - } else { - Cid::try_from(get_bytes_from_py_any(data)?) - }; - - if let Ok(cid) = cid { - Ok(cid) - } else { - Err(get_err( - "Failed to decode CID", - cid.unwrap_err().to_string(), - )) - } -} - -#[pyfunction] -fn decode_cid<'py>(py: Python<'py>, data: &Bound) -> PyResult> { - cid_to_pydict(py, &get_cid_from_py_any(data)?) -} - -#[pyfunction] -fn encode_cid<'py>(py: Python<'py>, data: &Bound) -> PyResult> { - Ok(PyString::new( - py, - get_cid_from_py_any(data)?.to_string().as_str(), - )) -} - -#[pyfunction] -fn decode_multibase<'py>(py: Python<'py>, data: &str) -> PyResult<(char, Bound<'py, PyBytes>)> { - let base = multibase::decode(data); - if let Ok((base, data)) = base { - Ok((base.code(), PyBytes::new(py, &data))) - } else { - Err(get_err( - "Failed to decode multibase", - base.unwrap_err().to_string(), - )) - } -} - -#[pyfunction] -fn encode_multibase(code: char, data: &Bound) -> PyResult { - let data_bytes = get_bytes_from_py_any(data)?; - let base = multibase::Base::from_code(code); - if let Ok(base) = base { - Ok(multibase::encode(base, data_bytes)) - } else { - Err(get_err( - "Failed to encode multibase", - base.unwrap_err().to_string(), - )) - } -} - -fn get_err(msg: &str, err: String) -> PyErr { - PyErr::new::(format!("{}. {}", msg, err)) -} +mod car; +mod cid; +mod convert; +mod dag_cbor; +mod error; +mod ffi; +mod io; +mod multibase; #[pymodule] #[pyo3(name = "_libipld")] fn libipld(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_function(wrap_pyfunction!(decode_cid, m)?)?; - m.add_function(wrap_pyfunction!(encode_cid, m)?)?; + m.add_function(wrap_pyfunction!(cid::decode_cid, m)?)?; + m.add_function(wrap_pyfunction!(cid::encode_cid, m)?)?; - m.add_function(wrap_pyfunction!(decode_car, m)?)?; + m.add_function(wrap_pyfunction!(car::decode_car, m)?)?; - m.add_function(wrap_pyfunction!(decode_dag_cbor, m)?)?; - m.add_function(wrap_pyfunction!(decode_dag_cbor_multi, m)?)?; - m.add_function(wrap_pyfunction!(encode_dag_cbor, m)?)?; + m.add_function(wrap_pyfunction!(dag_cbor::decode_dag_cbor, m)?)?; + m.add_function(wrap_pyfunction!(dag_cbor::decode_dag_cbor_multi, m)?)?; + m.add_function(wrap_pyfunction!(dag_cbor::encode_dag_cbor, m)?)?; - m.add_function(wrap_pyfunction!(decode_multibase, m)?)?; - m.add_function(wrap_pyfunction!(encode_multibase, m)?)?; + m.add_function(wrap_pyfunction!(multibase::decode_multibase, m)?)?; + m.add_function(wrap_pyfunction!(multibase::encode_multibase, m)?)?; Ok(()) } diff --git a/src/multibase.rs b/src/multibase.rs new file mode 100644 index 0000000..36ca77a --- /dev/null +++ b/src/multibase.rs @@ -0,0 +1,7 @@ +//! Multibase string codec (encode/decode of self-describing base encodings). + +pub(crate) mod de; +pub(crate) mod ser; + +pub(crate) use de::decode_multibase; +pub(crate) use ser::encode_multibase; diff --git a/src/multibase/de.rs b/src/multibase/de.rs new file mode 100644 index 0000000..de78e1a --- /dev/null +++ b/src/multibase/de.rs @@ -0,0 +1,17 @@ +use pyo3::prelude::*; +use pyo3::types::*; + +use crate::error::value_error; + +#[pyfunction] +pub fn decode_multibase<'py>(py: Python<'py>, data: &str) -> PyResult<(char, Bound<'py, PyBytes>)> { + let base = ::cid::multibase::decode(data); + if let Ok((base, data)) = base { + Ok((base.code(), PyBytes::new(py, &data))) + } else { + Err(value_error( + "Failed to decode multibase", + base.unwrap_err().to_string(), + )) + } +} diff --git a/src/multibase/ser.rs b/src/multibase/ser.rs new file mode 100644 index 0000000..9004eb2 --- /dev/null +++ b/src/multibase/ser.rs @@ -0,0 +1,18 @@ +use pyo3::prelude::*; + +use crate::convert::extract_bytes; +use crate::error::value_error; + +#[pyfunction] +pub fn encode_multibase(code: char, data: &Bound) -> PyResult { + let data_bytes = extract_bytes(data)?; + let base = ::cid::multibase::Base::from_code(code); + if let Ok(base) = base { + Ok(::cid::multibase::encode(base, data_bytes)) + } else { + Err(value_error( + "Failed to encode multibase", + base.unwrap_err().to_string(), + )) + } +}