# Source code for hocr.text
import xml.parsers.expat
from .util import open_if_required
from .parse import hocr_page_to_word_data_fast, hocr_page_iterator
def hocr_paragraph_text(paragraph):
    """
    Reconstruct text that matches the FTS text from a hOCR paragraph.

    Every word of every line in the paragraph is joined with a single space.
    The result has no leading or trailing separator and no line breaks are
    inserted between lines.

    Args:

    * paragraph: hOCR paragraph as returned by hocr_paragraphs. It is read as
      a mapping with a ``'lines'`` list, where every line has a ``'words'``
      list and every word has a ``'text'`` entry.

    Returns:

    * `str`: the paragraph text; empty when the paragraph contains no words.
    """
    # One join produces exactly what appending "text + ' '" for each word and
    # then dropping the final space produced, but in linear time and without
    # the former per-line UTF-8 encode/decode round trip (a no-op for any
    # encodable text).
    return ' '.join(
        word['text']
        for line in paragraph['lines']
        for word in line['words']
    )
def hocr_page_text_from_word_data(word_data):
    """
    Build the plain text of a page from already parsed hOCR word data.

    Args:

    * word_data: paragraphs of a page, as returned by
      ``hocr_page_to_word_data`` or ``hocr_page_to_word_data_fast``

    Returns: page contents (`str`); the text of each paragraph followed by a
    newline.
    """
    # The newline terminator is added here; it is not part of the paragraph
    # text itself.
    terminated = [hocr_paragraph_text(par) + '\n' for par in word_data]
    return ''.join(terminated)
def hocr_page_text(page):
    """
    Return the plain text of a single hOCR XML page element.

    The page is first converted to word data with
    ``hocr_page_to_word_data_fast`` and that word data is then rendered with
    ``hocr_page_text_from_word_data``.

    Args:

    * page: hOCR XML page element

    Returns: page contents (`str`)
    """
    return hocr_page_text_from_word_data(hocr_page_to_word_data_fast(page))
def get_paragraph_hocr_words(paragraph):
    """
    Collect every word of a hOCR paragraph into one flat list.

    Args:

    * paragraph: hOCR paragraph as returned by hocr_paragraphs.

    Returns a `list` with the word entries of all lines of the paragraph, in
    line order and, within a line, in word order.

    For this to be usable for matching purposes, only run this on merged hocr
    paragraphs as returned by hocr_paragraphs.
    """
    return [
        hocr_word
        for hocr_line in paragraph['lines']
        for hocr_word in hocr_line['words']
    ]
class PageFinder:
    """
    Expat handler set that records the byte index at which every ``ocr_page``
    div starts, followed by the byte index of the closing ``body`` tag, which
    denotes the end of the last page.

    The recorded indices are collected in ``page_bytes``.
    """

    def __init__(self, current_parser):
        self.page_bytes = []
        self.parser = current_parser
        # Hook ourselves into the parser so it reports element boundaries
        current_parser.StartElementHandler = self.start_element
        current_parser.EndElementHandler = self.end_element

    def start_element(self, name, attrs):
        # Only <div class="ocr_page"> elements mark the start of a page
        if name != 'div' or 'class' not in attrs:
            return
        if attrs['class'] == 'ocr_page':
            self.page_bytes.append(self.parser.CurrentByteIndex)

    def end_element(self, name):
        # The closing body tag terminates the final page
        if name != 'body':
            return
        self.page_bytes.append(self.parser.CurrentByteIndex)
def hocr_get_xml_page_offsets(fd_or_path):
    """
    Compute, for every ocr_page element of a hOCR XML file, the byte range it
    occupies in that file. Together with hocr_get_plaintext_page_offsets this
    can be used to construct a "lookup" table.

    Args:

    * fd_or_path: hOCR file to operate on, or a path (str).

    Return a list of tuples (start_byte, end_byte), one per ocr_page element.
    A page starts at the byte index of its opening tag and ends where the next
    page starts; the last page ends at the closing body tag.
    """
    source = open_if_required(fd_or_path)
    # Always parse from the very beginning of the file
    source.seek(0)

    expat_parser = xml.parsers.expat.ParserCreate()
    finder = PageFinder(expat_parser)
    expat_parser.ParseFile(source)

    # Pair every recorded index with the one that follows it
    marks = finder.page_bytes
    return list(zip(marks, marks[1:]))
def hocr_get_plaintext_page_offsets(fd_or_path):
    """
    Builds a list of start and end bytes for each ocr_page in the plain text
    file. That is, if the plain text were generated from a hOCR XML file, which
    plaintext is part of which ocr_page element, and where does that text start
    and end. This can be used to construct a "lookup" table, together with
    hocr_get_xml_page_offsets.

    Offsets are counted in bytes of the UTF-8 encoded page text, so they stay
    correct when the text contains non-ASCII characters.
    NOTE(review): this assumes the plain text file is stored as UTF-8 --
    confirm against whatever writes that file.

    Args:

    * fd_or_path: hOCR file to operate on, or a path (str).

    Return a list of tuples (start_byte, end_byte) for each ocr_page element in
    a hOCR file. The start and ends bytes point to the position of the text as
    extracted from the page in the XML file. The first page starts at 0 and
    every page starts where the previous one ended.
    """
    offsets = [0]
    cursor = 0

    for page in hocr_page_iterator(fd_or_path):
        # len() of a str counts code points, not bytes; encode first so that
        # multi-byte characters advance the cursor by their real size.
        cursor += len(hocr_page_text(page).encode('utf-8'))
        offsets.append(cursor)

    # Pair every offset with its successor: (start, end) per page
    return list(zip(offsets, offsets[1:]))