Source code for hocr.searching

import json

from xml.etree import ElementTree

from .util import open_if_required
from .parse import hocr_page_iterator, hocr_page_to_word_data_fast
from .text import hocr_get_xml_page_offsets, hocr_get_plaintext_page_offsets, \
        hocr_page_text_from_word_data, get_paragraph_hocr_words


def hocr_get_page_lookup_table(fd_or_path):
    """
    Create a lookup table for a given hOCR document. This allows for quickly
    jumping to specific XML pages.

    Args:

    * fd_or_path: file descriptor or filepath to the hOCR file

    Returns:

    Lookup table (a list of lists) with each entry of the form:

    * [text_start_byte, text_end_byte, xml_start_byte, xml_end_byte]
    """
    text_ranges = hocr_get_plaintext_page_offsets(fd_or_path)
    xml_ranges = hocr_get_xml_page_offsets(fd_or_path)

    if len(text_ranges) != len(xml_ranges):
        # Perhaps use something other than ValueError
        raise ValueError('text_ranges and xml_ranges do not match')

    res = []
    for text, xml in zip(text_ranges, xml_ranges):
        res.append((text[0], text[1], xml[0], xml[1]))

    return res
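
A minimal usage sketch (the file name example.hocr is an assumption, not part of the library): the table is normally built once per document and then reused for every lookup.

    from hocr.searching import hocr_get_page_lookup_table

    lookup = hocr_get_page_lookup_table('example.hocr')  # hypothetical path
    # Each entry describes one page:
    # (text_start_byte, text_end_byte, xml_start_byte, xml_end_byte)
    print(lookup[0])
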
def hocr_lookup_by_plaintext_offset(page_lookup_data, pos_bytes_plain):
    """
    Get the lookup index and data for a page that corresponds to the plaintext
    offset as specified in pos_bytes_plain.

    Args:

    * page_lookup_data: Lookup table as returned by hocr_load_lookup_table or
      hocr_get_page_lookup_table.
    * pos_bytes_plain: Offset in plaintext of the hOCR file.

    Returns:

    Tuple of (index, lookup table entry) for the matching page, or
    (None, None) if the offset does not fall within any page.
    """
    for idx, dat in enumerate(page_lookup_data):
        tstart, tend = dat[0:2]
        xstart, xend = dat[2:4]

        if tstart <= pos_bytes_plain < tend:
            return idx, dat

    return None, None
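
For example, to map a hit from a plaintext search back to its page (the path and the offset 1234 are assumptions):

    from hocr.searching import (hocr_get_page_lookup_table,
                                hocr_lookup_by_plaintext_offset)

    lookup = hocr_get_page_lookup_table('example.hocr')  # hypothetical path
    idx, entry = hocr_lookup_by_plaintext_offset(lookup, 1234)
    if entry is not None:
        print('offset 1234 falls on page index', idx)
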
def hocr_lookup_page_by_dat(fp, dat):
    """
    Get the XML for a specific hOCR page that corresponds to the lookup data
    `dat`.

    Args:

    * fp: file pointer to the hOCR file, opened in binary mode
    * dat: lookup table entry for the page

    Returns:

    The page as an xml.etree.ElementTree Element.
    """
    xstart, xend = dat[2:4]
    fp.seek(xstart)
    xml = fp.read(xend - xstart)

    root = ElementTree.fromstring(xml)
    return root
def hocr_lookup_page_by_plaintext_offset(fp, page_lookup_data, pos_bytes_plain):
    """
    Get the XML for a specific hOCR page that corresponds to the plaintext
    offset as specified in pos_bytes_plain.

    Args:

    * fp: file pointer to the hOCR file, opened in binary mode
    * page_lookup_data: Lookup table as returned by hocr_load_lookup_table or
      hocr_get_page_lookup_table.
    * pos_bytes_plain: Offset in plaintext of the hOCR file.

    Returns:

    The page as an xml.etree.ElementTree Element.
    """
    _, dat = hocr_lookup_by_plaintext_offset(page_lookup_data, pos_bytes_plain)
    return hocr_lookup_page_by_dat(fp, dat)
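
Putting the lookup table and the page fetch together, a sketch (the path and offset are assumptions; the file is opened in binary mode because the table stores byte offsets):

    from hocr.searching import (hocr_get_page_lookup_table,
                                hocr_lookup_page_by_plaintext_offset)

    lookup = hocr_get_page_lookup_table('example.hocr')  # hypothetical path
    with open('example.hocr', 'rb') as fp:
        page = hocr_lookup_page_by_plaintext_offset(fp, lookup, 1234)
        print(page.tag)  # root element of the page's XML fragment
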
def hocr_load_lookup_table(path):
    """
    Load lookup table from JSON.

    Args:

    * path: File to load from

    Returns:

    * Lookup table
    """
    fp = open_if_required(path)
    return json.loads(fp.read().decode('utf-8'))
def hocr_save_lookup_table(lookup_table, fd_or_path):
    """
    Save lookup table to JSON.

    Args:

    * lookup_table: Lookup table as returned by hocr_get_page_lookup_table
    * fd_or_path: File to save to
    """
    if isinstance(fd_or_path, str):
        fd_or_path = open(fd_or_path, 'w+')

    json.dump(lookup_table, fd_or_path)
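
A round-trip sketch (the JSON file name is an assumption): persisting the table avoids re-scanning the hOCR file on every run. Note that JSON turns the tuples into lists on reload.

    from hocr.searching import (hocr_get_page_lookup_table,
                                hocr_save_lookup_table, hocr_load_lookup_table)

    lookup = hocr_get_page_lookup_table('example.hocr')      # hypothetical path
    hocr_save_lookup_table(lookup, 'example.lookup.json')    # hypothetical path
    lookup = hocr_load_lookup_table('example.lookup.json')   # entries are now lists
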
def hocr_get_fts_text(fd_or_path):
    """
    Return text that can be ingested into a full-text search engine like Solr
    or Elasticsearch.

    Args:

    * fd_or_path: File descriptor or path to hOCR file

    Returns:

    Repeatedly yields a tuple of (``str``, ``list of int``), page text and a
    list of word confidences on the page.
    """
    for page in hocr_page_iterator(fd_or_path):
        word_data = hocr_page_to_word_data_fast(page)
        page_text = hocr_page_text_from_word_data(word_data)

        confs = []
        for paragraph in word_data:
            confs += [x['confidence'] for x in get_paragraph_hocr_words(paragraph)]

        yield page_text, confs
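
A sketch of feeding the generator into an indexer (the document layout and page numbering are assumptions; nothing here is specific to Solr or Elasticsearch):

    from hocr.searching import hocr_get_fts_text

    for page_no, (text, confidences) in enumerate(hocr_get_fts_text('example.hocr')):
        doc = {'page': page_no, 'text': text, 'word_confidences': confidences}
        # hand `doc` to the search engine's indexing API here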