diff --git a/rows/__init__.py b/rows/__init__.py
index 90211297..efc0505a 100644
--- a/rows/__init__.py
+++ b/rows/__init__.py
@@ -74,5 +74,8 @@
if plugins.pdf:
import_from_pdf = plugins.pdf.import_from_pdf
+if plugins.ocr:
+ import_from_image = plugins.ocr.import_from_image
+
__version__ = "0.4.2dev0"
diff --git a/rows/plugins/__init__.py b/rows/plugins/__init__.py
index 9bb2933f..838edf97 100644
--- a/rows/plugins/__init__.py
+++ b/rows/plugins/__init__.py
@@ -64,3 +64,8 @@
from . import plugin_pdf as pdf
except ImportError:
pdf = None
+
+try:
+ from . import plugin_ocr as ocr
+except ImportError:
+ ocr = None
diff --git a/rows/plugins/plugin_ocr.py b/rows/plugins/plugin_ocr.py
new file mode 100644
index 00000000..cd1357f5
--- /dev/null
+++ b/rows/plugins/plugin_ocr.py
@@ -0,0 +1,104 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+from cached_property import cached_property
+from pytesseract import image_to_boxes
+from PIL import Image
+
+from rows.plugins.plugin_pdf import PDFBackend, TextObject, pdf_table_lines
+from rows.plugins.utils import create_table
+from rows.plugins.utils_rect import join_contiguous_rects
+
+
class TesseractBackend(PDFBackend):
    """Backend that feeds Tesseract character boxes to the PDF table code.

    The image is read with ``pytesseract.image_to_boxes``; neighbouring
    character boxes are merged with ``join_contiguous_rects`` and the merged
    boxes are exposed as ``TextObject`` instances.
    """

    name = "tesseract"

    def __init__(self, filename_or_fobj, language):
        """Remember the image source and OCR language, then init the base.

        Args:
            filename_or_fobj: image filename, or an object with a ``read``
                method.
            language: value forwarded as ``lang`` to ``image_to_boxes``.
        """
        self.filename_or_fobj = filename_or_fobj
        self.language = language
        # Explicit two-argument form: the module imports ``unicode_literals``
        # from ``__future__``, and zero-argument ``super()`` is Python 3 only.
        super(TesseractBackend, self).__init__(self.filename_or_fobj)

    @cached_property
    def document(self):
        """Return what is handed to Tesseract.

        File-like objects (anything with ``read``) are opened with PIL;
        any other value is passed through unchanged.
        """
        if hasattr(self.filename_or_fobj, "read"):
            image = Image.open(self.filename_or_fobj)
        else:
            image = self.filename_or_fobj

        return image

    @cached_property
    def number_of_pages(self):
        """Return the page count (hard-coded to one for now)."""
        return 1  # TODO: fix

    def extract_text(self, page_numbers=None):
        """Return the plain text of the image (not implemented: always "")."""
        return ""  # TODO: image_to_string

    def objects(self, page_numbers=None, starts_after=None, ends_before=None):
        """Yield one list with the text objects found in the image.

        ``page_numbers``, ``starts_after`` and ``ends_before`` are accepted
        but not used by this method.

        Each non-empty line returned by ``image_to_boxes`` is split into the
        fields ``char left bottom right top page``; every field except
        ``char`` is converted to ``int``.
        """
        header = "char left bottom right top page".split()
        raw_boxes = image_to_boxes(self.document, lang=self.language)

        rects = []
        max_width = 0
        for line in raw_boxes.splitlines():
            if not line.strip():
                # A blank line carries no box and would break the lookups
                # below.
                continue
            rect = {}
            for key, value in zip(header, line.split()):
                rect[key] = value if key == "char" else int(value)
            rects.append(rect)
            max_width = max(max_width, rect["right"] - rect["left"])

        rects.sort(key=lambda rect: (rect["bottom"], rect["left"]))

        # join_contiguous_rects() reads and returns mappings keyed by
        # left/top/right/bottom/char, so the grouping has to happen on the
        # parsed rows; TextObject instances are built only afterwards.
        joined = join_contiguous_rects(rects, tolerance=max_width)
        yield [
            TextObject(
                x0=rect["left"],
                y0=rect["bottom"],
                x1=rect["right"],
                y1=rect["top"],
                text=rect["char"],
            )
            for rect in joined
        ]

    text_objects = objects
+
+
def import_from_image(
    filename_or_fobj,
    language="eng",
    algorithm="y-groups",
    x_threshold=1.0,
    y_threshold=1.0,
    *args,
    **kwargs
):
    """Build a table from an image by running OCR through ``pdf_table_lines``.

    Args:
        filename_or_fobj: image source handed to ``TesseractBackend``.
        language: passed to the backend as its ``language`` keyword.
        algorithm, x_threshold, y_threshold: forwarded unchanged to
            ``pdf_table_lines``.
        *args, **kwargs: forwarded unchanged to ``create_table``.

    Returns:
        Whatever ``create_table`` returns for the extracted lines; its
        ``meta`` argument is ``{"imported_from": "image"}``.
    """
    backend_options = {"language": language}
    extracted_lines = pdf_table_lines(
        filename_or_fobj,
        None,
        backend=TesseractBackend,
        backend_kwargs=backend_options,
        algorithm=algorithm,
        x_threshold=x_threshold,
        y_threshold=y_threshold,
        starts_after=None,
        ends_before=None,
    )
    table_meta = {"imported_from": "image"}
    return create_table(extracted_lines, *args, meta=table_meta, **kwargs)
diff --git a/rows/plugins/plugin_pdf.py b/rows/plugins/plugin_pdf.py
index 80fa8ebc..d8561841 100644
--- a/rows/plugins/plugin_pdf.py
+++ b/rows/plugins/plugin_pdf.py
@@ -714,13 +714,17 @@ def pdf_table_lines(
x_threshold=0.5,
y_threshold=0.5,
backend=None,
+ backend_args=None,
+ backend_kwargs=None,
):
backend = backend or default_backend()
# TODO: check if both backends accepts filename or fobj
Backend = get_backend(backend)
Algorithm = get_algorithm(algorithm)
- pdf_doc = Backend(filename_or_fobj)
+ backend_args = backend_args or []
+ backend_kwargs = backend_kwargs or {}
+ pdf_doc = Backend(filename_or_fobj, *backend_args, **backend_kwargs)
pages = pdf_doc.objects(
page_numbers=page_numbers, starts_after=starts_after, ends_before=ends_before
diff --git a/rows/plugins/utils_rect.py b/rows/plugins/utils_rect.py
new file mode 100644
index 00000000..3a14a13f
--- /dev/null
+++ b/rows/plugins/utils_rect.py
@@ -0,0 +1,98 @@
+from copy import copy
+
+
# Names of the four edges that make up a rectangle's geometry.
SIDES = "left top right bottom".split()


class Rect:
    """Rectangle whose fields can be read as attributes or as items.

    Every key of the mapping given to the constructor becomes an instance
    attribute, so extra fields (for example ``char``) travel along with the
    geometry. Equality and hashing only look at the four ``SIDES``.

    Instances are mutable: changing a side after the object has been used as
    a set member or dict key changes its hash.
    """

    def __init__(self, rect):
        """Copy every key/value pair of the mapping ``rect`` onto the instance."""
        self.__dict__.update(rect)

    def __hash__(self):
        return hash((self.left, self.top, self.right, self.bottom))

    def __getitem__(self, item):
        return self.__dict__[item]

    def __setitem__(self, item, value):
        self.__dict__[item] = value

    def __eq__(self, other):
        """Compare the four sides with ``other`` (a Rect or a mapping).

        Returns ``NotImplemented`` when ``other`` cannot be indexed by side
        name, so comparing with an unrelated object evaluates to ``False``
        instead of raising.
        """
        try:
            return all(self[side] == other[side] for side in SIDES)
        except (TypeError, KeyError, IndexError):
            return NotImplemented

    def __repr__(self):
        return "<{left}, {top}, {right}, {bottom}>".format(**self.__dict__)
+
+
def consolidate(new_rect, rect1, rect2):
    """Grow ``new_rect`` so that it also covers ``rect1`` and ``rect2``.

    When ``new_rect`` is ``None`` a shallow copy of ``rect1`` is used as the
    starting point. The rectangle is updated in place and returned: ``left``
    and ``bottom`` take the minimum of the three values, ``top`` and
    ``right`` take the maximum.
    """
    merged = copy(rect1) if new_rect is None else new_rect
    pickers = (min, max, max, min)  # same order as SIDES
    for index, side in enumerate(SIDES):
        pick = pickers[index]
        merged[side] = pick([merged[side], rect1[side], rect2[side]])
    return merged
+
+
def mag(x, y):
    """Return the squared length of the vector ``(x, y)``."""
    return x ** 2 + y ** 2


def find_paired_rects(rects, tolerance):
    """Map each rect to the rect that best continues it on the right.

    A candidate continues ``rect`` when its ``left`` edge lies within
    ``tolerance`` whole units of ``rect.right`` and its ``top`` lies within
    ``tolerance`` whole units of ``rect.top``. Among the candidates, the one
    with the smallest squared offset wins; on a tie the first one found is
    kept.

    Only candidates that start to the right of ``rect.left`` are considered.
    Without that restriction, two boxes narrower than ``tolerance`` could be
    paired with each other in both directions, and following the pairs would
    never end.

    Args:
        rects: objects exposing ``left``, ``right`` and ``top``; they must be
            hashable because they are used as dictionary keys.
        tolerance: non-negative integer, maximum offset on each axis.

    Returns:
        dict mapping a rect to its chosen right-hand neighbour. Rects with no
        suitable neighbour are absent.
    """
    rects_by_left = {}
    for rect in rects:
        rects_by_left.setdefault(rect.left, []).append(rect)

    offsets = range(-tolerance, tolerance + 1)
    paired = {}
    for rect in rects:
        best = None
        for offset_x in offsets:
            for candidate in rects_by_left.get(rect.right + offset_x, ()):
                # Skips the rect itself and anything that does not start to
                # its right.
                if candidate.left <= rect.left:
                    continue
                offset_y = candidate.top - rect.top
                # Whole-unit offsets only, mirroring the horizontal lookup.
                if abs(offset_y) > tolerance or offset_y % 1:
                    continue
                distance = mag(offset_x, offset_y)
                if best is None or distance < best:
                    paired[rect] = candidate
                    best = distance

    return paired
+
+
def join_contiguous_rects(rect_dicts, tolerance=1):
    """Merge horizontally adjacent rectangles into single rectangles.

    Args:
        rect_dicts: iterable of mappings with the keys ``left``, ``top``,
            ``right``, ``bottom`` and ``char`` (extra keys are kept).
        tolerance: maximum offset, passed to ``find_paired_rects``.

    Returns:
        list of dicts sorted by descending ``top`` and then ascending
        ``left``. Each merged chain is one dict whose sides span the whole
        chain and whose ``char`` is the chain's characters concatenated;
        its other keys come from the first rect of the chain. Rects that
        were not merged are returned unchanged.
    """
    rects = [Rect(rect) for rect in rect_dicts]
    paired = find_paired_rects(rects, tolerance)

    consolidated = []
    to_remove = set()

    for rect in sorted(rects, key=lambda r: r.left):
        if rect in to_remove:
            continue
        new_rect = None
        chars = []
        # Rects already followed in this chain: if the pairing loops back to
        # one of them, stop instead of walking the cycle forever.
        visited = set()
        while rect in paired and paired[rect] not in visited:
            visited.add(rect)
            chars.append(rect.char)
            new_rect = consolidate(new_rect, rect, paired[rect])
            to_remove.add(rect)
            rect = paired[rect]

        chars.append(rect.char)
        if new_rect is not None:
            new_rect.char = "".join(chars)
            to_remove.add(rect)
            consolidated.append(new_rect)

    untouched = [rect for rect in rects if rect not in to_remove]
    ordered = sorted(consolidated + untouched, key=lambda r: (-r.top, r.left))
    return [rect.__dict__ for rect in ordered]
+
diff --git a/tests/tests_plugin_ocr.py b/tests/tests_plugin_ocr.py
new file mode 100644
index 00000000..94e84f98
--- /dev/null
+++ b/tests/tests_plugin_ocr.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+
+# Copyright 2014-2019 Álvaro Justen
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+from __future__ import unicode_literals
+
+import unittest
+
+import rows
+from rows.plugins.utils_rect import join_contiguous_rects
+
+import tests.utils as utils
+
# Character boxes used by the rectangle-joining test, one tuple per box in
# the field order (char, left, bottom, right, top, page).
test_data = [
    dict(zip(("char", "left", "bottom", "right", "top", "page"), box))
    for box in (
        ("R", 1282.0, 52.0, 1284.0, 63.0, 0.0),
        ("S", 1284.0, 52.0, 1295.0, 63.0, 0.0),
        ("2", 1302.0, 52.0, 1303.0, 63.0, 0.0),
        ("5", 1303.0, 52.0, 1309.0, 63.0, 0.0),
        (".", 1312.0, 53.0, 1317.0, 63.0, 0.0),
        ("0", 1319.0, 53.0, 1321.0, 56.0, 0.0),
        ("0", 1326.0, 53.0, 1334.0, 64.0, 0.0),
        ("0", 1334.0, 53.0, 1338.0, 64.0, 0.0),
        (",", 1338.0, 53.0, 1343.0, 64.0, 0.0),
        ("0", 1344.0, 51.0, 1347.0, 56.0, 0.0),
        ("0", 1352.0, 53.0, 1362.0, 64.0, 0.0),
    )
]
+
+
class PluginOcrTestCase(utils.RowsTestMixIn, unittest.TestCase):
    """Checks for the OCR (image import) plugin."""

    filename = "tests/data/all-field-types.png"
    file_extension = "png"
    plugin_name = "ocr"

    def test_imports(self):
        """``rows.import_from_image`` must be the plugin's own function."""
        shortcut = rows.import_from_image
        plugin_function = rows.plugins.ocr.import_from_image
        self.assertIs(shortcut, plugin_function)

    def basic_test(self):
        """Import the sample image (no assertions yet)."""
        # NOTE(review): the name does not start with "test", so unittest's
        # default loader will presumably not collect this method - confirm
        # whether that is intentional.
        table = rows.import_from_image(self.filename)
        # TODO: assert
+
+
class TestRectUtils(unittest.TestCase):
    """Tests for the helpers in ``rows.plugins.utils_rect``."""

    def test_join_contiguous_rects(self):
        """All boxes of ``test_data`` collapse into a single rectangle."""
        expected = [
            {
                "char": "RS25.000,00",
                "left": 1282.0,
                "bottom": 51.0,
                "right": 1362.0,
                "top": 64.0,
                "page": 0.0,
            }
        ]
        # assertEqual: the assertEquals alias is deprecated and no longer
        # exists on Python 3.12+.
        self.assertEqual(join_contiguous_rects(test_data, 10), expected)