Source code for hocr.parse

import gzip
import re

from .util import open_if_required, iterparse_tags, HOCR_SCHEMA


# Writing-direction codes stored in each word's 'writing_direction' entry.
WRITING_DIRECTION_UNSPECIFIED = 0
WRITING_DIRECTION_LEFT_TO_RIGHT = 1
WRITING_DIRECTION_RIGHT_TO_LEFT = 2
WRITING_DIRECTION_TOP_TO_BOTTOM = 3

# Maps the value of an element's `dir` attribute to a writing-direction code.
# Only 'ltr' and 'rtl' are mapped; WRITING_DIRECTION_TOP_TO_BOTTOM has no
# entry here.
wdmap = {
    'ltr': WRITING_DIRECTION_LEFT_TO_RIGHT,
    'rtl': WRITING_DIRECTION_RIGHT_TO_LEFT,
}

# Patterns that pull individual properties out of an hOCR `title` attribute.
# In each pattern group(1) is the whitespace-separated value list that follows
# the property name (including its leading whitespace).
BBOX_REGEX = re.compile(r'bbox((\s+-?\d+){4})')  # four (optionally negative) integers
PPI_REGEX = re.compile(r'scan_res((\s+\d+){2})')  # two unsigned integers
BASELINE_REGEX = re.compile(r'baseline((\s+[\d\.\-]+){2})')  # two numbers, decimals/sign allowed
X_WCONF_REGEX = re.compile(r'x_wconf((\s+[\d\.\-]+){1})')  # one number, decimals/sign allowed
X_FSIZE_REGEX = re.compile(r'x_fsize((\s+[\d\.\-]+){1})')  # one number, decimals/sign allowed


def hocr_page_iterator(fd_or_path):
    """
    Returns an iterator to iterate over a (potentially large) hOCR XML file in
    a streaming manner.

    Each yielded page element is cleared as soon as the iterator is advanced,
    so a page must be fully consumed before the next one is requested.

    Args:

    * fd_or_path: open file to operate on, or a path (str).

    Returns:

    * Iterator returning a ElementTree.Element hOCR page.
    """
    fp = open_if_required(fd_or_path)
    # Rewind, so that an already (partially) read file object is parsed from
    # its beginning.
    fp.seek(0)

    # Match <div> elements both with and without the hOCR schema prefix.
    tags = {HOCR_SCHEMA + 'div', 'div'}

    for _, elem in iterparse_tags(fp, tag=tags):
        # A <div> without a class attribute is simply not a page; .get()
        # avoids the KeyError that attrib['class'] raises for such elements.
        if elem.attrib.get('class') != 'ocr_page':
            continue

        yield elem

        # Release the page's content once the consumer has moved on, to keep
        # memory usage flat on large documents.
        # NOTE(review): only page elements are cleared here; clearing nested
        # non-page <div>s would empty them before their enclosing page is
        # yielded if the parser reports elements as they close -- confirm
        # against iterparse_tags.
        elem.clear()
def hocr_page_get_dimensions(hocr_page):
    """
    Returns the dimensions (width, height) of a hocr page as returned by
    hocr_page_iterator.

    The values are the third and fourth numbers of the page's `bbox` title
    property, taken as-is (the first two numbers are not subtracted).

    Args:

    * hocr_page: a page as returned by hocr_page_iterator

    Returns:

    * (width, height): tuple of (int, int)
    """
    title = hocr_page.attrib['title']
    bbox_match = BBOX_REGEX.search(title)
    # BBOX_REGEX captures exactly four numbers: x0 y0 x1 y1.
    _, _, x1, y1 = bbox_match.group(1).split()
    return int(x1), int(y1)
def hocr_page_get_scan_res(hocr_page):
    """
    Returns the X and Y resolution of a hocr page as returned by
    hocr_page_iterator, read from the `scan_res` property of its title.

    Args:

    * hocr_page: a page as returned by hocr_page_iterator

    Returns:

    * (x_res, y_res): tuple of (int, int), or (None, None) if the scan_res
      property is not present.
    """
    match = PPI_REGEX.search(hocr_page.attrib['title'])
    if not match:
        return (None, None)

    # PPI_REGEX captures exactly two numbers.
    x_res, y_res = (int(value) for value in match.group(1).split())
    return (x_res, y_res)
# XXX: Maybe get rid of scaler here, and just move the normalisation of the
# x_fsize to pdfrenderer.py
def hocr_page_to_word_data(hocr_page, scaler=1):
    """
    Parses a single hocr_page into word data.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator
    * (optional) scaler: a scalar to scale font sizes by

    Returns:

    A list of paragraphs, each paragraph containing a list of lines, and each
    line containing a list of words, plus properties.

    Paragraphs have the following attributes:

    * `'lines'`: the lines that form this paragraph

    Lines have the following attributes:

    * `'words'`: the words that form this line
    * `'bbox'`: bounding box (list of 4 floats)
    * `'baseline'`: baseline of the line (list of 2 floats), `[0.0, 0.0]` if
      the line has no baseline property

    Words have the following attributes:

    * `'text'`: word text, str
    * `'bbox'`: bounding box (list of 4 floats)
    * `'fontsize'`: fontsize as a float (multiplied by `scaler`), or 0.
    * `'writing_direction'`: See WRITING_DIRECTION_* constants
    * `'confidence'`: word confidence as an int (a fractional `x_wconf` is
      truncated), or None if the word has no `x_wconf` property

    Raises:

    * ValueError: if a word that is not character based has more than one
      child element, or has no text.
    * KeyError: if a `dir` attribute has a value that is not in `wdmap`.
    """
    paragraphs = []

    for par in hocr_page.findall('.//*[@class="ocrx_block"]') + hocr_page.findall('.//*[@class="ocr_par"]'):
        paragraph_data = {'lines': []}

        paragraph_writing_direction = WRITING_DIRECTION_UNSPECIFIED
        if 'dir' in par.attrib:
            paragraph_writing_direction = wdmap[par.attrib['dir']]

        # We assume that the direct children are all the lines
        for line in par:
            line_title = line.attrib['title']

            linebox = BBOX_REGEX.search(line_title).group(1).split()
            baseline = BASELINE_REGEX.search(line_title)
            if baseline is not None:
                baseline = baseline.group(1).split()
            else:
                baseline = [0, 0]

            line_data = {
                'bbox': [float(i) for i in linebox],
                'baseline': [float(i) for i in baseline],
            }

            word_data = []
            for word in line.findall('.//*[@class="ocrx_word"]'):
                word_title = word.attrib['title']

                chars = word.findall('.//*[@class="ocrx_cinfo"]')
                if chars:
                    # Character based: the text is the concatenation of all
                    # the character nodes.
                    rawtext = ''.join(char.text for char in chars)
                else:
                    # Word based. Words may contain additional nodes like
                    # <em>; descend through single-child wrappers to reach
                    # the node that holds the text.
                    wword = word
                    children = list(wword)
                    while children:
                        if len(children) > 1:
                            raise ValueError('Not character based but word has multiple children?')
                        wword = children[0]
                        children = list(wword)

                    rawtext = wword.text
                    if rawtext is None:
                        raise ValueError('Word with no text value?')

                box = BBOX_REGEX.search(word_title).group(1).split()
                box = [float(i) for i in box]

                conf = None
                m = X_WCONF_REGEX.search(word_title)
                if m:
                    # X_WCONF_REGEX accepts decimals ("x_wconf 95.5"), which a
                    # plain int() rejects; go through float and truncate.
                    conf = int(float(m.group(1)))

                f_sizeraw = X_FSIZE_REGEX.search(word_title)
                if f_sizeraw:
                    x_fsize = float(f_sizeraw.group(1)) * scaler
                else:
                    x_fsize = 0.  # Will get fixed later on, in pdfrenderer at least

                # A word-level `dir` overrides the paragraph's direction.
                if 'dir' in word.attrib:
                    writing_direction = wdmap[word.attrib['dir']]
                else:
                    writing_direction = paragraph_writing_direction

                word_data.append({'bbox': box, 'text': rawtext,
                                  'fontsize': x_fsize,
                                  'writing_direction': writing_direction,
                                  'confidence': conf})

            line_data['words'] = word_data
            paragraph_data['lines'].append(line_data)

        paragraphs.append(paragraph_data)

    return paragraphs
def hocr_page_to_photo_data(hocr_page, minimum_page_area_pct=10):
    """
    Parses a single hocr_page into photo data.

    Photo boxes covering less than `minimum_page_area_pct` percent of the page
    area are dropped, as are boxes that lie fully inside another photo box.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator
    * (optional) minimum_page_area_pct: a minimum percentage of the page area
      the picture should inhabit

    Returns:

    A list of bounding boxes (each a list of 4 floats) where photos were
    found, in document order.
    """
    # Get the actual boxes from the page
    photo_boxes = []
    for photo in hocr_page.findall('.//*[@class="ocr_photo"]'):
        box = BBOX_REGEX.search(photo.attrib['title']).group(1).split()
        photo_boxes.append([float(i) for i in box])

    # Helper function to determine if there are nested boxes
    def box_contains_box(box_a, box_b):
        return box_a[0] <= box_b[0] and box_a[1] <= box_b[1] \
            and box_a[2] >= box_b[2] and box_a[3] >= box_b[3]

    # Clean up the box data a bit
    cleaned_photo_boxes = list(photo_boxes)

    def discard(box):
        # list.remove raises ValueError when the box was already removed by
        # an earlier rule; that is expected, anything else should surface.
        try:
            cleaned_photo_boxes.remove(box)
        except ValueError:
            pass

    dim = hocr_page_get_dimensions(hocr_page)
    area_page = dim[0]*dim[1]
    # Loop invariant: the smallest area a photo must cover to be kept.
    minimum_area = area_page*(minimum_page_area_pct/100.)

    for box_a in photo_boxes:
        # Image must cover at least minimum_page_area_pct of page
        width, height = box_a[2]-box_a[0], box_a[3]-box_a[1]
        if width*height < minimum_area:
            discard(box_a)

        # Nested boxes are redundant. Boxes with identical coordinates are
        # skipped, so exact duplicates do not eliminate each other.
        for box_b in photo_boxes:
            if box_a == box_b:
                continue
            if box_contains_box(box_a, box_b):
                discard(box_b)

    return cleaned_photo_boxes
def get_title_attrs(title):
    """
    Extracts the bounding box and word confidence from an hOCR `title`
    attribute value.

    Args:

    * title: the title attribute (str), a ';'-separated list of properties
      such as `'bbox 10 20 30 40; x_wconf 95'`

    Returns:

    * (box, conf): box is a list of 4 ints or None if there is no `bbox`
      property; conf is an int (a fractional `x_wconf` is truncated) or None
      if there is no `x_wconf` property.

    Raises:

    * ValueError: if a `bbox` or `x_wconf` value is not numeric.
    """
    box = None
    conf = None

    # Split on ';' alone and strip, so properties are found whether or not
    # the separator is followed by a space.
    for prop in title.split(';'):
        # Compare the whole property name, so that e.g. `bboxes` is not
        # mistaken for `bbox`.
        name, _, value = prop.strip().partition(' ')

        if name == 'x_wconf':
            # Go through float: the confidence may be written with decimals.
            conf = int(float(value))
        elif name == 'bbox':
            box = [int(i) for i in value.split()]

    return box, conf
def hocr_page_to_word_data_fast(hocr_page):
    """
    Parses a single hocr_page into word data, extracting only the text,
    bounding box and confidence of every word.

    Args:

    * hocr_page: a single hocr_page as returned by hocr_page_iterator

    Returns:

    A list of paragraphs, each paragraph containing a list of lines, and each
    line containing a list of words, plus properties.

    Paragraphs have the following attributes:

    * `'lines'`: the lines that form this paragraph

    Lines have the following attributes:

    * `'words'`: the words that form this line

    Words have the following attributes:

    * `'text'`: word text, str
    * `'bbox'`: bounding box as returned by get_title_attrs
    * `'confidence'`: word confidence as returned by get_title_attrs
    """
    result = []

    # Whether words carry per-character (ocrx_cinfo) nodes:
    #   0 - not determined yet
    #   1 - at least one word had them, keep looking for them in every word
    #   2 - the first word examined had none, stop looking altogether
    cinfo_state = 0

    par_nodes = hocr_page.findall('.//*[@class="ocr_par"]') + hocr_page.findall('.//*[@class="ocrx_block"]')
    for par_node in par_nodes:
        lines = []

        # We assume that the direct children are all the lines
        for line_node in list(par_node):
            words = []

            for word_node in line_node.findall('.//*[@class="ocrx_word"]'):
                box, conf = get_title_attrs(word_node.attrib['title'])

                text = ''
                is_word_based = True

                if cinfo_state < 2:
                    char_nodes = word_node.findall('.//*[@class="ocrx_cinfo"]')
                    if char_nodes:
                        text = ''.join(char_node.text for char_node in char_nodes)
                        is_word_based = False
                        cinfo_state = 1
                    elif cinfo_state == 0:
                        cinfo_state = 2

                if is_word_based:
                    # Words may contain additional nodes like <em>: follow
                    # the chain of single children down to the text node.
                    text_node = word_node
                    children = list(text_node)
                    while children:
                        if len(children) > 1:
                            raise ValueError('Not character based but word has multiple children?')
                        text_node = children[0]
                        children = list(text_node)

                    text = text_node.text
                    if text is None:
                        raise ValueError('Word with no text value?')

                words.append({'bbox': box, 'text': text, 'confidence': conf})

            lines.append({'words': words})

        result.append({'lines': lines})

    return result