# Source code for hocr.text

import xml.parsers.expat

from .util import open_if_required
from .parse import hocr_page_to_word_data_fast, hocr_page_iterator


def hocr_paragraph_text(paragraph):
    """
    Reconstruct text that matches the FTS text from a hOCR paragraph.

    All words of all lines are emitted in order, separated by a single
    space. Lines without words contribute nothing, and the result carries no
    trailing space.

    Args:

    * paragraph: hOCR paragraph as returned by hocr_paragraphs; it is read as
      ``paragraph['lines'][i]['words'][j]['text']``.

    Returns:

    * `str`: the paragraph text (an empty string if the paragraph contains no
      words). Only the text is returned, not a tuple.
    """
    # Joining the flattened word list yields the same string as appending
    # "word + ' '" for every word and then dropping the final space, but
    # without the repeated string concatenation.
    return ' '.join(word['text']
                    for line in paragraph['lines']
                    for word in line['words'])


def hocr_page_text_from_word_data(word_data):
    """
    Extract text from a pre-parsed hOCR page

    Each paragraph is rendered with ``hocr_paragraph_text`` and terminated
    with a newline character.

    Args:

    * word_data: as returned by ``hocr_page_to_word_data`` or
      ``hocr_page_to_word_data_fast``

    Returns: page contents (`str`); an empty string when there are no
    paragraphs.
    """
    # The newline is something we add, it is not part of the paragraph text.
    return ''.join(hocr_paragraph_text(paragraph) + '\n'
                   for paragraph in word_data)


def hocr_page_text(page):
    """
    Extract text from a hOCR XML page element.

    The page is first converted to word data with
    ``hocr_page_to_word_data_fast`` and that word data is then rendered with
    ``hocr_page_text_from_word_data``.

    Args:

    * page: hOCR XML page element

    Returns: page contents (`str`)
    """
    return hocr_page_text_from_word_data(hocr_page_to_word_data_fast(page))


def get_paragraph_hocr_words(paragraph):
    """
    Find all the words in a hOCR paragraph.

    Args:

    * paragraph: hOCR paragraph as returned by hocr_paragraphs.

    Returns a new `list` with the word objects of every line of the
    paragraph, in order. The word objects themselves are not copied.
    For this to be usable for matching purposes, only run this on merged hocr
    paragraphs as returned by hocr_paragraphs.
    """
    return [word
            for line in paragraph['lines']
            for word in line['words']]


# Finds bytes where pages start, and the final body segment to denote the end of
# the last page.
class PageFinder:
    """
    Collects byte offsets of page boundaries while an expat parser runs.

    On construction the instance installs its own ``start_element`` and
    ``end_element`` methods as the parser's element handlers. After parsing,
    ``page_bytes`` holds the parser's ``CurrentByteIndex`` for every
    ``<div class="ocr_page">`` start tag, plus one entry for each closing
    ``body`` tag.
    """

    def __init__(self, current_parser):
        # Offsets are appended in the order the parser reports the events.
        self.page_bytes = []

        self.parser = current_parser
        current_parser.StartElementHandler = self.start_element
        current_parser.EndElementHandler = self.end_element

    def start_element(self, name, attrs):
        # Only <div> elements that carry a class attribute can be pages.
        if name != 'div' or 'class' not in attrs:
            return

        if attrs['class'] == 'ocr_page':
            self.page_bytes.append(self.parser.CurrentByteIndex)

    def end_element(self, name):
        # The closing body tag marks where the last page stops.
        if name != 'body':
            return

        self.page_bytes.append(self.parser.CurrentByteIndex)


def hocr_get_xml_page_offsets(fd_or_path):
    """
    Builds a list of start and end bytes for each ocr_page element in the XML
    file.  This can be used to construct a "lookup" table, together with
    hocr_get_plaintext_page_offsets.

    The file is rewound to its start and parsed with expat; a ``PageFinder``
    records the byte index of every ocr_page start tag and of the closing
    body tag, and consecutive recorded indexes are paired up.

    Args:

    * fd_or_path: hOCR file to operate on, or a path (str).

    Return a list of tuples (start_byte, end_byte) for each ocr_page element in
    a hOCR file. The start and ends bytes point to the position of the page
    element in the XML file.
    """
    # NOTE(review): the handle obtained from open_if_required is not closed
    # here; presumably it is a freshly opened file when a path is passed --
    # confirm who owns it before adding a close().
    xml_file = open_if_required(fd_or_path)
    xml_file.seek(0)

    parser = xml.parsers.expat.ParserCreate()
    finder = PageFinder(parser)
    parser.ParseFile(xml_file)

    # Pair every recorded offset with the one that follows it: each page
    # ends where the next page (or the closing body tag) begins.
    offsets = finder.page_bytes
    return list(zip(offsets, offsets[1:]))


def hocr_get_plaintext_page_offsets(fd_or_path):
    """
    Builds a list of start and end offsets for each ocr_page in the plain text
    file. That is, if the plain text were generated from a hOCR XML file, which
    plaintext is part of which ocr_page element, and where does that text start
    and end. This can be used to construct a "lookup" table, together with
    hocr_get_xml_page_offsets.

    Args:

    * fd_or_path: hOCR file to operate on, or a path (str).

    Return a list of tuples (start, end) for each ocr_page element in a hOCR
    file. The offsets point to the position of the text as extracted from the
    page in the XML file; they are running totals of ``len()`` of each page's
    text as returned by ``hocr_page_text``.
    """
    # NOTE(review): len() of a str counts characters, not encoded bytes, so
    # these offsets differ from byte offsets in a UTF-8 text file as soon as
    # the text contains non-ASCII characters -- confirm which unit the
    # consumers of this table expect.
    boundaries = [0]

    for page in hocr_page_iterator(fd_or_path):
        boundaries.append(boundaries[-1] + len(hocr_page_text(page)))

    # A page's text ends exactly where the next page's text starts.
    return list(zip(boundaries, boundaries[1:]))