Source code for hocr.searching
import json
from xml.etree import ElementTree
from .util import open_if_required
from .parse import hocr_page_iterator, hocr_page_to_word_data_fast
from .text import hocr_get_xml_page_offsets, hocr_get_plaintext_page_offsets, \
hocr_page_text_from_word_data, get_paragraph_hocr_words
def hocr_get_page_lookup_table(fd_or_path):
    """
    Build a per-page lookup table for an hOCR document by pairing each
    page's plaintext range with its XML range. The table makes it possible
    to jump straight to the XML of a specific page.

    Args:

    * fd_or_path: file descriptor or filepath to the hOCR file

    Returns:

    Lookup table: a list with one 4-tuple per page, laid out as:

    * (text_start_byte, text_end_byte, xml_start_byte, xml_end_byte)

    Raises:

    * ValueError: if the number of plaintext ranges differs from the number
      of XML ranges.
    """
    # The plaintext offsets are computed before the XML offsets; keep this
    # order, both helpers receive the very same fd_or_path argument.
    plaintext_offsets = hocr_get_plaintext_page_offsets(fd_or_path)
    xml_offsets = hocr_get_xml_page_offsets(fd_or_path)

    # Perhaps use something other than ValueError
    if len(plaintext_offsets) != len(xml_offsets):
        raise ValueError('text_ranges and xml_ranges do not match')

    return [
        (plain[0], plain[1], markup[0], markup[1])
        for plain, markup in zip(plaintext_offsets, xml_offsets)
    ]
def hocr_lookup_by_plaintext_offset(page_lookup_data, pos_bytes_plain):
    """
    Get the lookup index and data for the page that corresponds to the
    plaintext offset as specified in pos_bytes_plain.

    Args:

    * page_lookup_data: Lookup table as returned by hocr_load_lookup_table or
      hocr_get_page_lookup_table.
    * pos_bytes_plain: Offset in plaintext of the hOCR file.

    Returns:

    Tuple of (index, entry) for the first entry whose plaintext range
    contains the offset, where entry is the matching element of
    page_lookup_data. Returns (None, None) if no entry contains the offset.
    """
    for idx, entry in enumerate(page_lookup_data):
        # Only the plaintext range (first two fields) is needed to decide
        # whether this page matches; the XML range is not inspected here.
        text_start, text_end = entry[0:2]

        # Half-open range: the start offset belongs to the page, the end
        # offset does not.
        if text_start <= pos_bytes_plain < text_end:
            return idx, entry

    return None, None
def hocr_lookup_page_by_dat(fp, dat):
    """
    Read and parse the XML of the hOCR page described by the lookup
    entry `dat`.

    Args:

    * fp: file pointer to hOCR file
    * `dat`: lookup table entry for the page; its third and fourth fields
      are used as the start and end offset of the page XML in `fp`

    Returns:

    * Root element of the parsed page XML
    """
    xml_start, xml_end = dat[2:4]

    # Jump to the page and read exactly the span covered by its XML.
    fp.seek(xml_start)
    return ElementTree.fromstring(fp.read(xml_end - xml_start))
def hocr_lookup_page_by_plaintext_offset(fp, page_lookup_data, pos_bytes_plain):
    """
    Fetch the XML of the hOCR page that contains the plaintext offset
    pos_bytes_plain.

    Args:

    * fp: file pointer to hOCR file
    * page_lookup_data: Lookup table as returned by hocr_load_lookup_table or
      hocr_get_page_lookup_table.
    * pos_bytes_plain: Offset in plaintext of the hOCR file.

    Returns:

    * Parsed XML of the matching page, as produced by
      hocr_lookup_page_by_dat.

    Note: when no page contains the offset, the lookup yields None as the
    entry and fetching the page then fails with a TypeError.
    """
    # The lookup returns (index, entry); only the entry is needed here.
    page_entry = hocr_lookup_by_plaintext_offset(page_lookup_data,
                                                 pos_bytes_plain)[1]
    return hocr_lookup_page_by_dat(fp, page_entry)
def hocr_load_lookup_table(path):
    """
    Read a lookup table that was stored as JSON.

    Args:

    * path: File to load from (handed to open_if_required)

    Returns:

    * Lookup table, as decoded from the JSON content
    """
    # NOTE(review): the handle is not closed here -- confirm whether
    # open_if_required or the caller is expected to own it.
    handle = open_if_required(path)

    # The handle yields bytes; the JSON text is stored as UTF-8.
    raw = handle.read()
    return json.loads(raw.decode('utf-8'))
def hocr_save_lookup_table(lookup_table, fd_or_path):
    """
    Save lookup table to JSON.

    Args:

    * lookup_table: Lookup table as returned by hocr_get_page_lookup_table
    * fd_or_path: File to save to. A ``str`` is treated as a path: the file
      is opened (truncating any existing content), written and closed.
      Anything else is treated as an already open, writable text file
      object and is left open for the caller to close.
    """
    if isinstance(fd_or_path, str):
        # The file is opened here, so it is also closed here: this makes
        # sure the JSON is flushed to disk before returning and that the
        # handle does not leak.
        with open(fd_or_path, 'w+') as fp:
            json.dump(lookup_table, fp)
        return

    # Caller-owned file object: write only, never close.
    json.dump(lookup_table, fd_or_path)
def hocr_get_fts_text(fd_or_path):
    """
    Produce per-page text suitable for ingestion into a full text search
    engine like SOLR or Elastic.

    Args:

    * fd_or_path: File descriptor or path to hOCR file

    Returns:

    Repeatedly yields a tuple of (``str``, ``list of int``),
    page text and a list of word confidences on the page.
    """
    for hocr_page in hocr_page_iterator(fd_or_path):
        paragraphs = hocr_page_to_word_data_fast(hocr_page)

        # The page text is built first, then the confidences are gathered
        # paragraph by paragraph, word by word.
        text = hocr_page_text_from_word_data(paragraphs)
        confidences = [
            word['confidence']
            for par in paragraphs
            for word in get_paragraph_hocr_words(par)
        ]

        yield text, confidences