From f6dfd38f2ae5be2ad04f0c31395b392d50f8ab6a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 18 Feb 2026 20:43:09 +0000 Subject: [PATCH 1/7] constrain pandas patch types to match csv semantics read_nsv now infers numeric types per-column (like read_csv) instead of leaving everything as strings. to_nsv converts non-string values to str and NaN to empty string before writing. --- nsv/__init__.py | 17 +++++-- tests/test_pandas.py | 106 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 tests/test_pandas.py diff --git a/nsv/__init__.py b/nsv/__init__.py index 46d7124..7925f3a 100644 --- a/nsv/__init__.py +++ b/nsv/__init__.py @@ -13,17 +13,26 @@ def patch_pandas(): return pd = sys.modules['pandas'] - def read_nsv(filepath_or_buffer, **kwargs): + def read_nsv(filepath_or_buffer, dtype=None, **kwargs): if isinstance(filepath_or_buffer, str): with open(filepath_or_buffer, 'r') as f: data = load(f) else: data = load(filepath_or_buffer) - return pd.DataFrame(data) + df = pd.DataFrame(data) + if dtype is not None: + df = df.astype(dtype) + else: + for col in df.columns: + converted = pd.to_numeric(df[col], errors='coerce') + # Keep only if no non-empty values were coerced to NaN + lost = converted.isna() & (df[col] != '') & df[col].notna() + if not lost.any(): + df[col] = converted + return df def to_nsv(self, path_or_buf=None, **kwargs): - # TODO: this is naive, pandas can have non-string values - data = self.values + data = [['' if pd.isna(v) else str(v) for v in row] for row in self.values] if path_or_buf is None: return dumps(data) diff --git a/tests/test_pandas.py b/tests/test_pandas.py new file mode 100644 index 0000000..fc0f623 --- /dev/null +++ b/tests/test_pandas.py @@ -0,0 +1,106 @@ +import unittest +from io import StringIO + +import pandas as pd +import numpy as np + +import nsv + + +def setUpModule(): + nsv.patch_pandas() + + +class TestReadNsvTypeInference(unittest.TestCase): + """read_nsv should infer types the same way read_csv does.""" + + def _compare_with_csv(self, rows): + """Assert that read_nsv produces the same dtypes and values as read_csv.""" + nsv_str = nsv.dumps(rows) + csv_str = '\n'.join(','.join(row) for row in rows) + '\n' + + nsv_df = pd.read_nsv(StringIO(nsv_str)) + csv_df = pd.read_csv(StringIO(csv_str), header=None) + + self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes), + f"dtype mismatch for rows={rows}") + pd.testing.assert_frame_equal(nsv_df, csv_df) + + def test_integers(self): + self._compare_with_csv([['1', '2'], ['3', '4']]) + + def test_floats(self): + self._compare_with_csv([['1.5', '2.5'], ['3.5', '4.5']]) + + def test_mixed_int_float(self): + self._compare_with_csv([['1', '2.5'], ['3', '4.5']]) + + def test_strings(self): + self._compare_with_csv([['hello', 'world'], ['foo', 'bar']]) + + def test_mixed_numeric_and_string(self): + self._compare_with_csv([['123', 'abc'], ['456', 'def']]) + + def test_empty_fields_in_numeric_column(self): + self._compare_with_csv([['1', 'a'], ['', 'b'], ['3', 'c']]) + + def test_scientific_notation(self): + self._compare_with_csv([['1.23e5', '4.56e-2'], ['7.89e1', '0.12e3']]) + + def test_negative_numbers(self): + self._compare_with_csv([['-1', '-2.5'], ['3', '4.5']]) + + def test_all_empty(self): + self._compare_with_csv([['', ''], ['', '']]) + + +class TestReadNsvDtype(unittest.TestCase): + """read_nsv should support explicit dtype parameter.""" + + def test_dtype_str_suppresses_inference(self): + data = [['123', '456'], ['789', '012']] + nsv_str = nsv.dumps(data) + df = pd.read_nsv(StringIO(nsv_str), dtype=str) + for col in df.columns: + self.assertFalse(pd.api.types.is_numeric_dtype(df[col])) + self.assertEqual(df.iloc[0, 0], '123') + + def test_dtype_per_column(self): + data = [['123', '4.5'], ['789', '6.7']] + nsv_str = nsv.dumps(data) + df = pd.read_nsv(StringIO(nsv_str), dtype={0: float, 1: str}) + self.assertTrue(pd.api.types.is_float_dtype(df[0])) + self.assertFalse(pd.api.types.is_numeric_dtype(df[1])) + + +class TestToNsv(unittest.TestCase): + """to_nsv should handle non-string types gracefully.""" + + def test_roundtrip_integers(self): + df = pd.DataFrame({0: [1, 2, 3], 1: [4, 5, 6]}) + nsv_str = df.to_nsv() + self.assertIsInstance(nsv_str, str) + df2 = pd.read_nsv(StringIO(nsv_str)) + pd.testing.assert_frame_equal(df, df2) + + def test_roundtrip_floats(self): + df = pd.DataFrame({0: [1.5, 2.5], 1: [3.5, 4.5]}) + nsv_str = df.to_nsv() + df2 = pd.read_nsv(StringIO(nsv_str)) + pd.testing.assert_frame_equal(df, df2) + + def test_roundtrip_mixed(self): + df = pd.DataFrame({0: [1, 2], 1: ['x', 'y']}) + nsv_str = df.to_nsv() + df2 = pd.read_nsv(StringIO(nsv_str)) + pd.testing.assert_frame_equal(df, df2) + + def test_nan_becomes_empty(self): + df = pd.DataFrame({'a': [1.0, float('nan'), 3.0]}) + nsv_str = df.to_nsv() + rows = nsv.loads(nsv_str) + self.assertEqual(rows[1], ['']) + + +if __name__ == '__main__': + unittest.main() From 27fd74ba74a4300c5b7ec423566cabd76dd423c4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 19:33:25 +0000 Subject: [PATCH 2/7] Fix three bugs caught in code review - reader.py: check() appended a bare int instead of a (pos, line, col) tuple when the string ends with a trailing backslash, which would crash the subsequent unpacking loop - core.py: remove unused loop variable i in dumps() - test_utils.py: load_then_dump() splatted the list into dumps() instead of passing it as a single iterable argument --- nsv/core.py | 2 +- nsv/reader.py | 2 +- tests/test_utils.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nsv/core.py b/nsv/core.py index 72a4be2..7dc6214 100644 --- a/nsv/core.py +++ b/nsv/core.py @@ -30,7 +30,7 @@ def dump(data: Iterable[Iterable[str]], file_obj): def dumps(data: Iterable[Iterable[str]]) -> str: """Write elements to an NSV string.""" lines = [] - for i, row in enumerate(data): + for row in data: for cell in row: lines.append(Writer.escape(cell)) lines.append('') diff --git a/nsv/reader.py b/nsv/reader.py index 173239d..76e0345 100644 --- a/nsv/reader.py +++ b/nsv/reader.py @@ -63,7 +63,7 @@ def check(s: str): else: col += 1 if escaped: - sus.append(len(s) - 1) + sus.append((len(s) - 1, line, col)) for pos, line, col in sus: print(f'WARNING: Unescaped backslash at position {pos} ({line}:{col})') if s[-1] != '\n': diff --git a/tests/test_utils.py b/tests/test_utils.py index 0237151..802f130 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -42,7 +42,7 @@ def dump_then_load(data): def load_then_dump(s): - return nsv.dumps(*nsv.loads(s)) + return nsv.dumps(nsv.loads(s)) def load_sample(name): From a2e35a8369cfa9ed027e6494c7ce55d52dde517e Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 20:17:20 +0000 Subject: [PATCH 3/7] Match read_csv auto-inference exactly: NA strings and booleans - Apply pandas' default NA value set (NA, NaN, nan, null, None, etc.) before type inference so NA strings become NaN in all column types, matching read_csv behaviour - Detect all-true/false columns (case-insensitive) and cast to bool; bool+NA columns return object with Python bools and NaN, also matching read_csv behaviour - Refactor inference into _infer_column() helper - Add TestReadNsvNullInference and TestReadNsvBoolInference test classes, all using read_csv as the oracle --- nsv/__init__.py | 28 ++++++++++++++---- tests/test_pandas.py | 67 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/nsv/__init__.py b/nsv/__init__.py index 7925f3a..dd6cca6 100644 --- a/nsv/__init__.py +++ b/nsv/__init__.py @@ -6,12 +6,34 @@ FEATURES = {} +_BOOL_VALUES = frozenset({'true', 'false'}) + + def patch_pandas(): """Add NSV support to pandas if available in context.""" import sys if 'pandas' not in sys.modules: return pd = sys.modules['pandas'] + from pandas.io.parsers.readers import STR_NA_VALUES + + def _infer_column(col): + """Infer dtype to match read_csv auto-detection.""" + na_mask = col.isin(STR_NA_VALUES) + col_na = col.where(~na_mask) + + # Numeric: accept if no non-NA values are lost + converted = pd.to_numeric(col_na, errors='coerce') + if not (converted.isna() & col_na.notna()).any(): + return converted + + # Bool: all non-NA values must be true/false (case-insensitive) + non_na = col_na.dropna() + if len(non_na) > 0 and non_na.str.lower().isin(_BOOL_VALUES).all(): + as_bool = col_na.map(lambda x: x.lower() == 'true' if pd.notna(x) else x) + return as_bool if na_mask.any() else as_bool.astype(bool) + + return col_na def read_nsv(filepath_or_buffer, dtype=None, **kwargs): if isinstance(filepath_or_buffer, str): @@ -24,11 +46,7 @@ def read_nsv(filepath_or_buffer, dtype=None, **kwargs): df = df.astype(dtype) else: for col in df.columns: - converted = pd.to_numeric(df[col], errors='coerce') - # Keep only if no non-empty values were coerced to NaN - lost = converted.isna() & (df[col] != '') & df[col].notna() - if not lost.any(): - df[col] = converted + df[col] = _infer_column(df[col]) return df def to_nsv(self, path_or_buf=None, **kwargs): diff --git a/tests/test_pandas.py b/tests/test_pandas.py index fc0f623..ee2c628 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -54,6 +54,73 @@ def test_all_empty(self): self._compare_with_csv([['', ''], ['', '']]) +class TestReadNsvNullInference(unittest.TestCase): + """read_nsv should treat the same strings as NaN that read_csv does.""" + + def _compare_with_csv(self, rows): + nsv_str = nsv.dumps(rows) + csv_str = '\n'.join(','.join(row) for row in rows) + '\n' + nsv_df = pd.read_nsv(StringIO(nsv_str)) + csv_df = pd.read_csv(StringIO(csv_str), header=None) + self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes), + f"dtype mismatch for rows={rows}") + pd.testing.assert_frame_equal(nsv_df, csv_df) + + def test_na_string_in_numeric_column(self): + self._compare_with_csv([['NA', '1'], ['2', '3']]) + + def test_nan_string_in_numeric_column(self): + self._compare_with_csv([['NaN', '1'], ['2', '3']]) + + def test_nan_lowercase_in_numeric_column(self): + self._compare_with_csv([['nan', '1'], ['2', '3']]) + + def test_null_string_in_numeric_column(self): + self._compare_with_csv([['null', '1'], ['2', '3']]) + + def test_none_string_in_numeric_column(self): + self._compare_with_csv([['None', '1'], ['2', '3']]) + + def test_na_string_in_string_column(self): + self._compare_with_csv([['hello', 'NA'], ['world', 'there']]) + + def test_all_na_column(self): + self._compare_with_csv([['NA', 'a'], ['NaN', 'b'], ['null', 'c']]) + + +class TestReadNsvBoolInference(unittest.TestCase): + """read_nsv should infer bool columns the same way read_csv does.""" + + def _compare_with_csv(self, rows): + nsv_str = nsv.dumps(rows) + csv_str = '\n'.join(','.join(row) for row in rows) + '\n' + nsv_df = pd.read_nsv(StringIO(nsv_str)) + csv_df = pd.read_csv(StringIO(csv_str), header=None) + self.assertEqual(list(nsv_df.dtypes), list(csv_df.dtypes), + f"dtype mismatch for rows={rows}") + pd.testing.assert_frame_equal(nsv_df, csv_df) + + def test_bool_true_false(self): + self._compare_with_csv([['True', 'False'], ['True', 'False']]) + + def test_bool_lowercase(self): + self._compare_with_csv([['true', 'false'], ['true', 'false']]) + + def test_bool_uppercase(self): + self._compare_with_csv([['TRUE', 'FALSE'], ['TRUE', 'FALSE']]) + + def test_bool_mixed_case(self): + self._compare_with_csv([['True', 'false'], ['FALSE', 'True']]) + + def test_bool_with_na(self): + # NA mixed in: read_csv returns object with Python bools and nan + self._compare_with_csv([['True', 'a'], ['NA', 'b'], ['False', 'c']]) + + def test_not_bool_T_F(self): + # 'T'/'F' are NOT inferred as bool by read_csv + self._compare_with_csv([['T', 'F'], ['T', 'F']]) + + class TestReadNsvDtype(unittest.TestCase): """read_nsv should support explicit dtype parameter.""" From 0186ec81411a42208128037ccbf4a1c538c79415 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 20:23:58 +0000 Subject: [PATCH 4/7] Skip pandas tests gracefully when pandas is not installed CI runs without pandas (it's an optional dependency). Guard all test classes with @skip_no_pandas so the suite passes without it. --- tests/test_pandas.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index ee2c628..29f869c 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -1,8 +1,12 @@ import unittest from io import StringIO -import pandas as pd -import numpy as np +try: + import pandas as pd + import numpy as np + HAS_PANDAS = True +except ImportError: + HAS_PANDAS = False import nsv @@ -11,6 +15,10 @@ def setUpModule(): nsv.patch_pandas() +skip_no_pandas = unittest.skipUnless(HAS_PANDAS, 'pandas not installed') + + +@skip_no_pandas class TestReadNsvTypeInference(unittest.TestCase): """read_nsv should infer types the same way read_csv does.""" @@ -54,6 +62,7 @@ def test_all_empty(self): self._compare_with_csv([['', ''], ['', '']]) +@skip_no_pandas class TestReadNsvNullInference(unittest.TestCase): """read_nsv should treat the same strings as NaN that read_csv does.""" @@ -88,6 +97,7 @@ def test_all_na_column(self): self._compare_with_csv([['NA', 'a'], ['NaN', 'b'], ['null', 'c']]) +@skip_no_pandas class TestReadNsvBoolInference(unittest.TestCase): """read_nsv should infer bool columns the same way read_csv does.""" @@ -121,6 +131,7 @@ def test_not_bool_T_F(self): self._compare_with_csv([['T', 'F'], ['T', 'F']]) +@skip_no_pandas class TestReadNsvDtype(unittest.TestCase): """read_nsv should support explicit dtype parameter.""" @@ -140,6 +151,7 @@ def test_dtype_per_column(self): self.assertFalse(pd.api.types.is_numeric_dtype(df[1])) +@skip_no_pandas class TestToNsv(unittest.TestCase): """to_nsv should handle non-string types gracefully.""" From 1d5cbe422a441ddac1b1d1c23ce236f0e306aa83 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 20:26:36 +0000 Subject: [PATCH 5/7] Install pandas in CI and revert test skip guards The pandas extra was not being installed in CI. Switch to pip install -e ".[pandas]" so the tests actually run. Revert the skipUnless guards added in the previous commit. --- .github/workflows/tests.yml | 2 +- tests/test_pandas.py | 16 ++-------------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5671273..0ad3df6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -24,7 +24,7 @@ jobs: - name: Install package run: | python -m pip install --upgrade pip - pip install -e . + pip install -e ".[pandas]" - name: Run tests run: python -m unittest discover -s tests -p 'test*.py' -v diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 29f869c..ee2c628 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -1,12 +1,8 @@ import unittest from io import StringIO -try: - import pandas as pd - import numpy as np - HAS_PANDAS = True -except ImportError: - HAS_PANDAS = False +import pandas as pd +import numpy as np import nsv @@ -15,10 +11,6 @@ def setUpModule(): nsv.patch_pandas() -skip_no_pandas = unittest.skipUnless(HAS_PANDAS, 'pandas not installed') - - -@skip_no_pandas class TestReadNsvTypeInference(unittest.TestCase): """read_nsv should infer types the same way read_csv does.""" @@ -62,7 +54,6 @@ def test_all_empty(self): self._compare_with_csv([['', ''], ['', '']]) -@skip_no_pandas class TestReadNsvNullInference(unittest.TestCase): """read_nsv should treat the same strings as NaN that read_csv does.""" @@ -97,7 +88,6 @@ def test_all_na_column(self): self._compare_with_csv([['NA', 'a'], ['NaN', 'b'], ['null', 'c']]) -@skip_no_pandas class TestReadNsvBoolInference(unittest.TestCase): """read_nsv should infer bool columns the same way read_csv does.""" @@ -131,7 +121,6 @@ def test_not_bool_T_F(self): self._compare_with_csv([['T', 'F'], ['T', 'F']]) -@skip_no_pandas class TestReadNsvDtype(unittest.TestCase): """read_nsv should support explicit dtype parameter.""" @@ -151,7 +140,6 @@ def test_dtype_per_column(self): self.assertFalse(pd.api.types.is_numeric_dtype(df[1])) -@skip_no_pandas class TestToNsv(unittest.TestCase): """to_nsv should handle non-string types gracefully.""" From f7a63f1cceba9c32b97e34661009abf1c07f928f Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 22:47:50 +0000 Subject: [PATCH 6/7] Replace hand-rolled type inference with read_csv Instead of reimplementing pandas' bool/NA/numeric detection, convert NSV rows to CSV in memory and pass to read_csv directly. --- nsv/__init__.py | 38 ++++++++------------------------------ 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/nsv/__init__.py b/nsv/__init__.py index dd6cca6..3f2b017 100644 --- a/nsv/__init__.py +++ b/nsv/__init__.py @@ -6,48 +6,26 @@ FEATURES = {} -_BOOL_VALUES = frozenset({'true', 'false'}) - - def patch_pandas(): """Add NSV support to pandas if available in context.""" import sys + import io + import csv if 'pandas' not in sys.modules: return pd = sys.modules['pandas'] - from pandas.io.parsers.readers import STR_NA_VALUES - - def _infer_column(col): - """Infer dtype to match read_csv auto-detection.""" - na_mask = col.isin(STR_NA_VALUES) - col_na = col.where(~na_mask) - - # Numeric: accept if no non-NA values are lost - converted = pd.to_numeric(col_na, errors='coerce') - if not (converted.isna() & col_na.notna()).any(): - return converted - # Bool: all non-NA values must be true/false (case-insensitive) - non_na = col_na.dropna() - if len(non_na) > 0 and non_na.str.lower().isin(_BOOL_VALUES).all(): - as_bool = col_na.map(lambda x: x.lower() == 'true' if pd.notna(x) else x) - return as_bool if na_mask.any() else as_bool.astype(bool) - - return col_na - - def read_nsv(filepath_or_buffer, dtype=None, **kwargs): + def read_nsv(filepath_or_buffer, **kwargs): if isinstance(filepath_or_buffer, str): with open(filepath_or_buffer, 'r') as f: data = load(f) else: data = load(filepath_or_buffer) - df = pd.DataFrame(data) - if dtype is not None: - df = df.astype(dtype) - else: - for col in df.columns: - df[col] = _infer_column(df[col]) - return df + + buf = io.StringIO() + csv.writer(buf).writerows(data) + buf.seek(0) + return pd.read_csv(buf, header=None, **kwargs) def to_nsv(self, path_or_buf=None, **kwargs): data = [['' if pd.isna(v) else str(v) for v in row] for row in self.values] From ce1c4e352c6aa227bbc46b7643f1b24dda4403ff Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 14 Mar 2026 22:50:52 +0000 Subject: [PATCH 7/7] Scope bool_values inside patch_pandas, drop CSV roundtrip Keep type inference local to patch_pandas rather than leaking constants into module scope. No CSV serialization overhead. --- nsv/__init__.py | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/nsv/__init__.py b/nsv/__init__.py index 3f2b017..be6d222 100644 --- a/nsv/__init__.py +++ b/nsv/__init__.py @@ -9,23 +9,41 @@ def patch_pandas(): """Add NSV support to pandas if available in context.""" import sys - import io - import csv if 'pandas' not in sys.modules: return pd = sys.modules['pandas'] + from pandas.io.parsers.readers import STR_NA_VALUES - def read_nsv(filepath_or_buffer, **kwargs): + bool_values = frozenset({'true', 'false'}) + + def _infer_column(col): + na_mask = col.isin(STR_NA_VALUES) + col_na = col.where(~na_mask) + + converted = pd.to_numeric(col_na, errors='coerce') + if not (converted.isna() & col_na.notna()).any(): + return converted + + non_na = col_na.dropna() + if len(non_na) > 0 and non_na.str.lower().isin(bool_values).all(): + as_bool = col_na.map(lambda x: x.lower() == 'true' if pd.notna(x) else x) + return as_bool if na_mask.any() else as_bool.astype(bool) + + return col_na + + def read_nsv(filepath_or_buffer, dtype=None, **kwargs): if isinstance(filepath_or_buffer, str): with open(filepath_or_buffer, 'r') as f: data = load(f) else: data = load(filepath_or_buffer) - - buf = io.StringIO() - csv.writer(buf).writerows(data) - buf.seek(0) - return pd.read_csv(buf, header=None, **kwargs) + df = pd.DataFrame(data) + if dtype is not None: + df = df.astype(dtype) + else: + for col in df.columns: + df[col] = _infer_column(df[col]) + return df def to_nsv(self, path_or_buf=None, **kwargs): data = [['' if pd.isna(v) else str(v) for v in row] for row in self.values]