diff --git a/.gitignore b/.gitignore index d0c63a6..6035ca2 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,9 @@ *.pyc *.pyo *.egg-info +hocr.egg-info +dist/ +build/ # Git files *.orig diff --git a/build/lib/hocr/__init__.py b/build/lib/hocr/__init__.py new file mode 100644 index 0000000..d878826 --- /dev/null +++ b/build/lib/hocr/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- + +__title__ = 'hocr' +__version__ = '0.2.12' +__copyright__ = 'Copyright 2017 Vic.ai' +__description__ = 'HOCR manipulation and utility library, based on @jsfenfen and @concordusapps' +__author__ = 'rune@vic.ai' +# __all__ = ['parse', ] # noqa diff --git a/build/lib/hocr/page.py b/build/lib/hocr/page.py new file mode 100644 index 0000000..cb0d486 --- /dev/null +++ b/build/lib/hocr/page.py @@ -0,0 +1,167 @@ +import re +import six + + +class Box(object): + + def __init__(self, text=None, left=0, right=0, top=0, bottom=0): + + # Parse the text string representation if given. + if text is not None: + left, top, right, bottom = map(int, text.split()) + + self.left = left + self.right = right + self.top = top + self.bottom = bottom + + @property + def width(self): + return self.right - self.left + + @property + def height(self): + return self.bottom - self.top + + def __repr__(self): + return '' % ( + self.left, self.top, self.right, self.bottom) + + +class Base(object): + + _allowed_ocr_classes = {} + _dir_methods = [] + + def __init__(self, element): # noqa + """ + @param[in] element + XML node for the OCR element. + """ + # Store the element for later reference. + self._element = element + + # Create an element cache. + self._cache = {} + + # Parse the properties of the HOCR element. 
+ properties = element.get('title', '').split(';') + for prop in properties: + prop = prop.strip() + + if six.PY3: + name, value = prop.split(maxsplit=1) + else: + name, value = prop.split(' ', 1) + + if name == 'bbox': + self.box = Box(value) + + elif name == 'image': + self.image = value.strip('" ') + + elif name == 'x_wconf': + self.wconf = int(value) + + elif name == 'textangle': + self.textangle = int(value) + if value == '90': + self.vertical = True + + elif name == 'x_size': + self.size = value + + elif name == 'x_ascenders': + self.ascenders = float(value) + + elif name == 'x_descenders': + self.descenders = float(value) + + elif name == 'ppageno': + self.ppageno = int(value) + + def __dir__(self): + + if six.PY3: + return super().__dir__() + list(self._allowed_ocr_classes) + else: + return list(self._allowed_ocr_classes) + getattr(self, '_dir_methods', []) + return super( + Base, self).__dir__() + list(self._allowed_ocr_classes) + + def __getattr__(self, name): + # Return the cached version if present. + if name in self._cache: + return self._cache[name] + + # Parse the named OCR elements. + if name in self._allowed_ocr_classes: + ref = OCR_CLASSES[name] + nodes = self._element.find_all(class_=re.compile(ref['name'])) + self._cache[name] = elements = list(map(ref['class'], nodes)) + return elements + + # Attribute is not present. + raise AttributeError(name) + + +class Word(Base): + + _allowed_ocr_classes = {} + _dir_methods = ['box', 'bold', 'italic', 'lang', 'wconf'] + + def __init__(self, element): + # Initialize the base. + if six.PY3: + super().__init__(element) + else: + super(Word, self).__init__(element) + + # Discover if we are "bold". + # A word element is bold if its text node is wrapped in a `strong` element. + self.bold = bool(element.find('strong')) + + # Discover if we are "italic". + # A word element is italic if its text node is wrapped in an `em` element. + self.italic = bool(element.find('em')) + + # Find the text node. 
+ self.text = element.text + + self.lang = element.get("lang", '') + + def __str__(self): + return '' % (self.text, self.box) + + +class Line(Base): + _allowed_ocr_classes = {'words'} + _dir_methods = ['box', 'text', 'vertical', 'textangle'] + vertical = False + textangle = 0 + + @property + def text(self): + return ' '.join([w.text for w in self.words]) + + +class Paragraph(Base): + _allowed_ocr_classes = {'lines', 'words'} + + +class Block(Base): + _allowed_ocr_classes = {'paragraphs', 'lines', 'words'} + _dir_methods = ['box', ] + + +class Page(Base): + _allowed_ocr_classes = {'blocks', 'paragraphs', 'lines', 'words'} + _dir_methods = ['image', ] + + +OCR_CLASSES = { + 'words': {'name': 'ocr.?_word', 'class': Word}, + 'lines': {'name': 'ocr_line', 'class': Line}, + 'paragraphs': {'name': 'ocr_par', 'class': Paragraph}, + 'blocks': {'name': 'ocr_carea', 'class': Block} +} diff --git a/build/lib/hocr/parser.py b/build/lib/hocr/parser.py new file mode 100644 index 0000000..5e361b2 --- /dev/null +++ b/build/lib/hocr/parser.py @@ -0,0 +1,29 @@ +import six +import re +from bs4 import UnicodeDammit, BeautifulSoup +from .page import Page + +kill_html_closing_tags = re.compile(r'\<\/\s*html', re.I) + + +def parse(source): + """Parse a HOCR stream into page elements. + @param[in] source + Either a file-like object or a filename of the HOCR text. + """ + # Coerce the source into content. + if isinstance(source, six.string_types): + with open(source, 'rb') as stream: + content = stream.read() + + else: + content = source.read() + + # Parse the HOCR xml stream. + ud = UnicodeDammit(content, is_html=True) + + # will take a while for a 500 page document + soup = BeautifulSoup(ud.unicode_markup, 'lxml') + + # Get all the pages and parse them into page elements. 
+ return [Page(x) for x in soup.find_all(class_='ocr_page')] diff --git a/dist/hocr-0.2.12-py2.7.egg b/dist/hocr-0.2.12-py2.7.egg new file mode 100644 index 0000000..89083bb Binary files /dev/null and b/dist/hocr-0.2.12-py2.7.egg differ diff --git a/hocr/__init__.py b/hocr/__init__.py index e69de29..d878826 100644 --- a/hocr/__init__.py +++ b/hocr/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- + +__title__ = 'hocr' +__version__ = '0.2.12' +__copyright__ = 'Copyright 2017 Vic.ai' +__description__ = 'HOCR manipulation and utility library, based on @jsfenfen and @concordusapps' +__author__ = 'rune@vic.ai' +# __all__ = ['parse', ] # noqa diff --git a/hocr/page.py b/hocr/page.py index 6f701f7..f30fa31 100644 --- a/hocr/page.py +++ b/hocr/page.py @@ -31,13 +31,13 @@ def __repr__(self): class Base(object): _allowed_ocr_classes = {} + _dir_methods = [] - def __init__(self, element): + def __init__(self, element): # noqa """ @param[in] element XML node for the OCR element. """ - # Store the element for later reference. 
self._element = element @@ -60,11 +60,35 @@ def __init__(self, element): elif name == 'image': self.image = value.strip('" ') + elif name == 'x_wconf': + self.wconf = int(value) + + elif name == 'textangle': + self.textangle = int(value) + if value == '90': + self.vertical = True + + elif name == 'x_size': + self.size = value + + elif name == 'x_ascenders': + self.ascenders = float(value) + + elif name == 'x_descenders': + self.descenders = float(value) + + elif name == 'ppageno': + self.ppageno = int(value) + + if element.get('lang', None): + self.lang = element.get('lang', None) + def __dir__(self): if six.PY3: return super().__dir__() + list(self._allowed_ocr_classes) else: + return list(self._allowed_ocr_classes) + getattr(self, '_dir_methods', []) return super( Base, self).__dir__() + list(self._allowed_ocr_classes) @@ -87,6 +111,7 @@ def __getattr__(self, name): class Word(Base): _allowed_ocr_classes = {} + _dir_methods = ['box', 'bold', 'italic', 'lang', 'wconf'] def __init__(self, element): # Initialize the base. @@ -106,7 +131,7 @@ def __init__(self, element): # Find the text node. 
self.text = element.text - self.lang = element.get("lang",'') + self.lang = element.get("lang", '') def __str__(self): return '' % (self.text, self.box) @@ -114,18 +139,28 @@ def __str__(self): class Line(Base): _allowed_ocr_classes = {'words'} + _dir_methods = ['box', 'text', 'vertical', 'textangle'] + vertical = False + textangle = 0 + + @property + def text(self): + return ' '.join([w.text for w in self.words]) class Paragraph(Base): _allowed_ocr_classes = {'lines', 'words'} + _dir_methods = ['lang', ] class Block(Base): _allowed_ocr_classes = {'paragraphs', 'lines', 'words'} + _dir_methods = ['box', ] class Page(Base): _allowed_ocr_classes = {'blocks', 'paragraphs', 'lines', 'words'} + _dir_methods = ['image', ] OCR_CLASSES = { diff --git a/requirements_dev.txt b/requirements.txt similarity index 100% rename from requirements_dev.txt rename to requirements.txt diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4b586ae --- /dev/null +++ b/setup.py @@ -0,0 +1,29 @@ +# +# Copyright 2017 Vic.ai - Rune Loyning +# +# https://github.com/Vic-ai/python-hocr +# + +from setuptools import find_packages +from distutils.core import setup +from pkgutil import get_importer + +meta = get_importer('hocr').find_module('__init__').load_module('__init__') + +setup( + name="hocr", + version=meta.__version__, + description=meta.__description__, + author='Vic.ai', + author_email='rune@vic.ai', + url='http://github.com/loyning/python-24so/', + keywords='hocr', + classifiers=[], + packages=find_packages(), + include_package_data=True, + install_requires=[ + 'beautifulsoup4', + 'six', + 'lxml' + ], +)