# Source code for hocr.parse

import gzip
import re

from .util import open_if_required, iterparse_tags, HOCR_SCHEMA


# Writing direction codes; these are the values stored under the
# 'writing_direction' key of the word dicts built by hocr_page_to_word_data.
WRITING_DIRECTION_UNSPECIFIED = 0
WRITING_DIRECTION_LEFT_TO_RIGHT = 1
WRITING_DIRECTION_RIGHT_TO_LEFT = 2
WRITING_DIRECTION_TOP_TO_BOTTOM = 3

# Maps the value of an element's 'dir' attribute to a writing direction code.
# Note that there is no entry that yields WRITING_DIRECTION_TOP_TO_BOTTOM, and
# that looking up any other 'dir' value raises KeyError.
wdmap = {
    'ltr': WRITING_DIRECTION_LEFT_TO_RIGHT,
    'rtl': WRITING_DIRECTION_RIGHT_TO_LEFT,
}

# Patterns applied to the 'title' attribute of hOCR elements. In each pattern,
# group(1) is the whitespace-separated run of values following the property
# name (including the leading whitespace).

# 'bbox' followed by four (optionally negative) integers
BBOX_REGEX = re.compile(r'bbox((\s+-?\d+){4})')
# 'scan_res' followed by two unsigned integers
PPI_REGEX = re.compile(r'scan_res((\s+\d+){2})')
# 'baseline' followed by two tokens made of digits, '.' and '-'
BASELINE_REGEX = re.compile(r'baseline((\s+[\d\.\-]+){2})')
# 'x_wconf' followed by one token made of digits, '.' and '-'
X_WCONF_REGEX = re.compile(r'x_wconf((\s+[\d\.\-]+){1})')
# 'x_fsize' followed by one token made of digits, '.' and '-'
X_FSIZE_REGEX = re.compile(r'x_fsize((\s+[\d\.\-]+){1})')


def hocr_page_iterator(fd_or_path):
    """
    Returns an iterator to iterate over a (potentially large) hOCR XML file in a
    streaming manner.

    Every page element is cleared as soon as the consumer asks for the next
    one, so a page must be fully processed before advancing the iterator and
    must not be kept around afterwards.

    Args:

    * fd_or_path: open file to operate on, or a path (str).

    Returns:

    * Iterator returning a ElementTree.Element hOCR page.
    """
    fp = open_if_required(fd_or_path)

    # Seek to start
    fp.seek(0)

    # Match <div> both with and without the hOCR schema prefix
    tags = {HOCR_SCHEMA + 'div', 'div'}
    for _action, elem in iterparse_tags(fp, tag=tags):
        # Use .get() rather than indexing: a <div> that carries no class
        # attribute is simply not a page and must not abort the iteration
        # with a KeyError.
        if elem.attrib.get('class') != 'ocr_page':
            # Deliberately not cleared: presumably such a <div> can be a
            # descendant of a page that has not been yielded yet (depends on
            # the events iterparse_tags produces -- TODO confirm).
            continue

        yield elem

        # Release the memory held by the page once the consumer is done
        elem.clear()


def hocr_page_get_dimensions(hocr_page):
    """
    Returns the dimensions (width, height) of a hocr page as returned by
    hocr_page_iterator.

    The dimensions are taken from the third and fourth value of the bbox
    property in the page's title attribute.

    Args:

    * hocr_page: a page as returned by hocr_page_iterator

    Returns:

    * (width, height): tuple of (int, int)
    """
    title = hocr_page.attrib['title']
    bbox_match = BBOX_REGEX.search(title)
    coords = bbox_match.group(1).split()
    return int(coords[2]), int(coords[3])


def hocr_page_get_scan_res(hocr_page):
    """
    Returns the X and Y resolution of a hocr page as returned by
    hocr_page_iterator, read from the scan_res property in the page's title
    attribute.

    Args:

    * hocr_page: a page as returned by hocr_page_iterator

    Returns:

    * (x_res, y_res): tuple of (int, int)

    Or (None, None) if the scan_res property is not present.
    """
    match = PPI_REGEX.search(hocr_page.attrib['title'])
    if not match:
        return (None, None)

    # The pattern guarantees exactly two values
    x_res, y_res = (int(value) for value in match.group(1).split())
    return (x_res, y_res)


# XXX: Maybe get rid of scaler here, and just move the normalisation of the
# x_fsize to pdfrenderer.py
def _extract_word_text(word):
    """
    Returns the text of a single ocrx_word element (helper of
    hocr_page_to_word_data).

    Raises ValueError if a word that is not character based has more than one
    child at some nesting level, or has no text at all.
    """
    chars = word.findall('.//*[@class="ocrx_cinfo"]')
    if chars:
        # Character based: the word is the concatenation of its characters.
        # A character element without text contributes nothing.
        return ''.join(char.text or '' for char in chars)

    # Word based. Words may contain additional nodes like <em>, so descend
    # through the single-child wrappers to the innermost element.
    node = word
    while True:
        children = list(node)
        if not children:
            break

        if len(children) > 1:
            raise ValueError('Not character based but word has multiple children?')

        node = children[0]

    if node.text is None:
        raise ValueError('Word with no text value?')

    return node.text


def hocr_page_to_word_data(hocr_page, scaler=1):
    """
    Parses a single hocr_page into word data.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator
    * (optional) scaler: a scalar to multiply font sizes by

    Returns:

    A list of paragraphs, each paragraph containing a list of lines, and each
    line containing a list of words, plus properties. Elements with class
    ocrx_block come first, followed by elements with class ocr_par.

    Paragraphs have the following attributes:

    * `'lines'`: the lines that form this paragraph

    Lines have the following attributes:

    * `'words'`: the words that form this line
    * `'bbox'`: bounding box (list of 4 floats)
    * `'baseline'`: baseline of the line (list of 2 floats), [0.0, 0.0] if the
      line has no baseline property

    Words have the following attributes:

    * `'text'`: word text, str
    * `'bbox'`: bounding box (list of 4 floats)
    * `'fontsize'`: x_fsize multiplied by scaler as a float, or 0. if the word
      has no x_fsize property
    * `'writing_direction'`: See WRITING_DIRECTION_* constants; taken from the
      word's dir attribute, falling back to the one of the paragraph
    * `'confidence'`: word confidence as an int (a fractional x_wconf is
      truncated), or None if the word has no x_wconf property

    Raises:

    * KeyError: a line or word has no title attribute, or a dir attribute has
      a value that is not in wdmap
    * AttributeError: the title of a line or word has no bbox property
    * ValueError: the text of a word can not be determined
    """
    paragraphs = []

    pars = hocr_page.findall('.//*[@class="ocrx_block"]') \
        + hocr_page.findall('.//*[@class="ocr_par"]')
    for par in pars:
        paragraph_writing_direction = WRITING_DIRECTION_UNSPECIFIED
        if 'dir' in par.attrib:
            paragraph_writing_direction = wdmap[par.attrib['dir']]

        lines = []

        # We assume that the direct children are all the lines
        for line in par:
            line_title = line.attrib['title']

            linebox = BBOX_REGEX.search(line_title).group(1).split()
            baseline_match = BASELINE_REGEX.search(line_title)
            if baseline_match is not None:
                baseline = [float(i) for i in baseline_match.group(1).split()]
            else:
                baseline = [0., 0.]

            words = []
            for word in line.findall('.//*[@class="ocrx_word"]'):
                text = _extract_word_text(word)

                word_title = word.attrib['title']
                box = BBOX_REGEX.search(word_title).group(1).split()

                conf = None
                conf_match = X_WCONF_REGEX.search(word_title)
                if conf_match:
                    # The pattern also accepts decimals ('x_wconf 95.5'),
                    # which int() alone rejects, so go through float() first
                    conf = int(float(conf_match.group(1)))

                fsize_match = X_FSIZE_REGEX.search(word_title)
                if fsize_match:
                    x_fsize = float(fsize_match.group(1)) * scaler
                else:
                    x_fsize = 0. # Will get fixed later on, in pdfrenderer at least

                # The direction of the word wins over the one of the paragraph
                if 'dir' in word.attrib:
                    writing_direction = wdmap[word.attrib['dir']]
                else:
                    writing_direction = paragraph_writing_direction

                words.append({
                    'bbox': [float(i) for i in box],
                    'text': text,
                    'fontsize': x_fsize,
                    'writing_direction': writing_direction,
                    'confidence': conf,
                })

            lines.append({
                'bbox': [float(i) for i in linebox],
                'baseline': baseline,
                'words': words,
            })

        paragraphs.append({'lines': lines})

    return paragraphs

def hocr_page_to_photo_data(hocr_page, minimum_page_area_pct=10):
    """
    Parses a single hocr_page into photo data.

    A photo box is dropped when it covers less than minimum_page_area_pct
    percent of the page area, or when it lies fully inside another photo box
    that has different coordinates. The remaining boxes keep their document
    order.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator
    * (optional) minimum_page_area_pct: a minimum percentage of the page area the picture should inhabit

    Returns:

    A list of bounding boxes (each a list of 4 floats) where photos were found
    """

    # Get the actual boxes from the page
    photo_boxes = []
    for photo in hocr_page.findall('.//*[@class="ocr_photo"]'):
        coords = BBOX_REGEX.search(photo.attrib['title']).group(1).split()
        photo_boxes.append([float(i) for i in coords])

    page_width, page_height = hocr_page_get_dimensions(hocr_page)
    minimum_area = page_width * page_height * (minimum_page_area_pct / 100.)

    # Helper function to determine if there are nested boxes
    def box_contains_box(box_a, box_b):
        return box_a[0] <= box_b[0] and box_a[1] <= box_b[1] \
           and box_a[2] >= box_b[2] and box_a[3] >= box_b[3]

    # Image must cover at least minimum_page_area_pct of page
    def is_too_small(box):
        return (box[2] - box[0]) * (box[3] - box[1]) < minimum_area

    # Nested boxes are redundant. Boxes with equal coordinates are not
    # considered to be nested in each other, so exact duplicates are kept.
    def is_nested(box):
        return any(other != box and box_contains_box(other, box)
                   for other in photo_boxes)

    # Filter in a single pass instead of removing entries by value from a copy
    # of the list; this needs no exception handling for boxes that were
    # already removed.
    return [box for box in photo_boxes
            if not is_too_small(box) and not is_nested(box)]

def get_title_attrs(title):
    """
    Extracts the bounding box and the word confidence from the title
    attribute of a hOCR element.

    Properties are separated on ';'. Whitespace around a property is ignored,
    so both '; ' (as written by Tesseract) and a bare ';' work, as does a
    trailing ';'.

    Args:

    * title: value of the title attribute (str)

    Returns:

    * (box, conf): box is the list of ints following 'bbox', conf is the int
      following 'x_wconf'. Either is None if the property is not present.

    Raises:

    * ValueError: a bbox or x_wconf value is not an integer
    """
    box = None
    conf = None

    for prop in title.split(';'):
        name, _, value = prop.strip().partition(' ')
        if name == 'x_wconf':
            conf = int(value)
        elif name == 'bbox':
            box = [int(i) for i in value.split()]

    return box, conf

def hocr_page_to_word_data_fast(hocr_page):
    """
    Parses a single hocr_page into word data, collecting only the text,
    bounding box and confidence of every word.

    Whether the page is character based (words made of ocrx_cinfo elements)
    is decided on the first word: if that word has no ocrx_cinfo elements,
    no other word of the page is searched for them.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator

    Returns:

    A list of paragraphs, each paragraph containing a list of lines, and each
    line containing a list of words, plus properties. Elements with class
    ocr_par come first, followed by elements with class ocrx_block.

    Paragraphs have the following attributes:

    * `'lines'`: the lines that form this paragraph

    Lines have the following attributes:

    * `'words'`: the words that form this line

    Words have the following attributes:

    * `'text'`: word text, str
    * `'bbox'`: bounding box as returned by get_title_attrs (list of ints),
      or None if the title has no bbox property
    * `'confidence'`: word confidence as returned by get_title_attrs, or None
      if the title has no x_wconf property
    """
    # States of the ocrx_cinfo detection
    cinfo_unknown, cinfo_present, cinfo_absent = 0, 1, 2
    cinfo_state = cinfo_unknown

    result = []

    pars = hocr_page.findall('.//*[@class="ocr_par"]')
    pars = pars + hocr_page.findall('.//*[@class="ocrx_block"]')

    for par in pars:
        par_lines = []

        # We assume that the direct children are all the lines
        for line in par:
            line_words = []

            for word in line.findall('.//*[@class="ocrx_word"]'):
                box, conf = get_title_attrs(word.attrib['title'])

                # Only search for characters while they have not been ruled out
                chars = []
                if cinfo_state != cinfo_absent:
                    chars = word.findall('.//*[@class="ocrx_cinfo"]')

                if chars:
                    cinfo_state = cinfo_present
                    text = ''.join(char.text for char in chars)
                else:
                    if cinfo_state == cinfo_unknown:
                        # The first word had no characters: stop searching
                        cinfo_state = cinfo_absent

                    # Words may contain additional nodes like <em>; walk down
                    # to the innermost element to find the text
                    node = word
                    children = list(node)
                    while children:
                        if len(children) > 1:
                            raise ValueError('Not character based but word has multiple children?')

                        node = children[0]
                        children = list(node)

                    text = node.text
                    if text is None:
                        raise ValueError('Word with no text value?')

                line_words.append({'bbox': box, 'text': text,
                                   'confidence': conf})

            par_lines.append({'words': line_words})

        result.append({'lines': par_lines})

    return result