From b7a1fb232b283ea766439e6bfa1791dd8529ad35 Mon Sep 17 00:00:00 2001 From: Joel Capitao Date: Thu, 30 Jun 2022 16:33:02 +0200 Subject: [PATCH] Replace unicodecsv by standard csv module unicodecsv has not been maintained for a while now [1]. It was preferred over the standard csv module because of its Unicode support. Now that the Python 3 csv module [2] supports Unicode, let's use it. For more context, we hit issues while rebuilding unicodecsv during the Fedora Python 3.11 mass rebuild [3][4]. [1] https://github.com/jdunck/python-unicodecsv [2] https://docs.python.org/3/library/csv.html [3] https://copr.fedorainfracloud.org/coprs/g/python/python3.11/package/python-unicodecsv/ [4] https://bugzilla.redhat.com/show_bug.cgi?id=2021938 --- rows/plugins/plugin_csv.py | 26 +++++++++++++------------- setup.py | 5 ++--- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/rows/plugins/plugin_csv.py b/rows/plugins/plugin_csv.py index d018a213..d92821f3 100644 --- a/rows/plugins/plugin_csv.py +++ b/rows/plugins/plugin_csv.py @@ -20,34 +20,34 @@ from io import BytesIO import six -import unicodecsv +import csv from rows.plugins.utils import create_table, ipartition, serialize from rows.utils import Source -sniffer = unicodecsv.Sniffer() +sniffer = csv.Sniffer() # Some CSV files have more than 128kB of data in a cell, so we force this value # to be greater (16MB). # TODO: check if it impacts in memory usage. # TODO: may add option to change it by passing a parameter to import/export. 
-unicodecsv.field_size_limit(16777216) +csv.field_size_limit(16777216) def fix_dialect(dialect): if not dialect.doublequote and dialect.escapechar is None: dialect.doublequote = True - if dialect.quoting == unicodecsv.QUOTE_MINIMAL and dialect.quotechar == "'": + if dialect.quoting == csv.QUOTE_MINIMAL and dialect.quotechar == "'": # Python csv's Sniffer seems to detect a wrong quotechar when # quoting is minimal dialect.quotechar = '"' -class excel_semicolon(unicodecsv.excel): +class excel_semicolon(csv.excel): delimiter = ";" -unicodecsv.register_dialect("excel-semicolon", excel_semicolon) +csv.register_dialect("excel-semicolon", excel_semicolon) if six.PY2: @@ -60,8 +60,8 @@ def discover_dialect(sample, encoding=None, delimiters=(b",", b";", b"\t", b"|") try: dialect = sniffer.sniff(sample, delimiters=delimiters) - except unicodecsv.Error: # Couldn't detect: fall back to 'excel' - dialect = unicodecsv.excel + except csv.Error: # Couldn't detect: fall back to 'excel' + dialect = csv.excel fix_dialect(dialect) return dialect @@ -96,8 +96,8 @@ def discover_dialect(sample, encoding, delimiters=(",", ";", "\t", "|")): try: dialect = sniffer.sniff(decoded, delimiters=delimiters) - except unicodecsv.Error: # Couldn't detect: fall back to 'excel' - dialect = unicodecsv.excel + except csv.Error: # Couldn't detect: fall back to 'excel' + dialect = csv.excel fix_dialect(dialect) return dialect @@ -133,7 +133,7 @@ def import_from_csv( sample=read_sample(source.fobj, sample_size), encoding=source.encoding ) - reader = unicodecsv.reader(source.fobj, encoding=encoding, dialect=dialect) + reader = csv.reader(source.fobj, encoding=encoding, dialect=dialect) meta = {"imported_from": "csv", "source": source} return create_table(reader, meta=meta, *args, **kwargs) @@ -143,7 +143,7 @@ def export_to_csv( table, filename_or_fobj=None, encoding="utf-8", - dialect=unicodecsv.excel, + dialect=csv.excel, batch_size=100, callback=None, *args, @@ -176,7 +176,7 @@ def export_to_csv( # TODO: 
may use `io.BufferedWriter` instead of `ipartition` so user can # choose the real size (in Bytes) when to flush to the file system, instead # number of rows - writer = unicodecsv.writer(source.fobj, encoding=encoding, dialect=dialect) + writer = csv.writer(source.fobj, encoding=encoding, dialect=dialect) if callback is None: for batch in ipartition(serialize(table, *args, **kwargs), batch_size): diff --git a/setup.py b/setup.py index 5fcfc355..91765cdc 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,6 @@ utils_requirements = ["requests", "requests-cache", "tqdm"] EXTRA_REQUIREMENTS = { "cli": ["click"] + utils_requirements, - "csv": ["unicodecsv"], "detect": ["file-magic"], "html": ["lxml"], # apt: libxslt-dev libxml2-dev "ods": ["lxml"], @@ -46,10 +45,10 @@ } EXTRA_REQUIREMENTS["all"] = sum(EXTRA_REQUIREMENTS.values(), []) INSTALL_REQUIREMENTS = [ - "dataclasses", + "dataclasses", "six", "requests", -] + EXTRA_REQUIREMENTS["csv"] +] LONG_DESCRIPTION = """ No matter in which format your tabular data is: rows will import it, automatically detect types and give you high-level Python objects so you can