# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
__all__ = ("PdfTextPage", "PdfTextSearcher")
import ctypes
import logging
import warnings
import pypdfium2.raw as pdfium_c
import pypdfium2.internal as pdfium_i
from pypdfium2._helpers.misc import PdfiumError
c_double = ctypes.c_double
logger = logging.getLogger(__name__)
[docs]
class PdfTextPage (pdfium_i.AutoCloseable):
"""
Text page helper class.
Attributes:
raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
page (PdfPage): Reference to the page this textpage belongs to.
"""
def __init__(self, raw, page):
self.raw = raw
self.page = page
super().__init__(pdfium_c.FPDFText_ClosePage)
@property
def parent(self): # AutoCloseable hook
return self.page
def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0):
if c_start > c_end:
return 0 # no active chars in range
t_start = pdfium_c.FPDFText_GetTextIndexFromCharIndex(self, c_start)
if t_start == -1:
return self._get_active_text_range(c_start+1, c_end, l_passive+1, r_passive)
t_end = pdfium_c.FPDFText_GetTextIndexFromCharIndex(self, c_end)
if t_end == -1:
return self._get_active_text_range(c_start, c_end-1, l_passive, r_passive+1)
return t_start, t_end, l_passive, r_passive
[docs]
def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
"""
Warning:
.. versionchanged:: 4.28
Unexpected upstream changes have caused allocation size concerns with this API.
Using it is now discouraged unless you specifically need to extract a character range. Prefer :meth:`.get_text_bounded` where possible.
Calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
Extract text from a given range.
Parameters:
index (int): Index of the first char to include.
count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
Returns:
str: The text in the range in question, or an empty string if no text was found.
Note:
* The returned text's length does not have to match *count*, even if it will for most PDFs.
This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
* In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
"""
# https://github.com/pypdfium2-team/pypdfium2/issues/298
# https://crbug.com/pdfium/2133
if (index, count) == (0, -1) and not force_this:
warnings.warn("get_text_range() call with default params will be implicitly redirected to get_text_bounded()")
return self.get_text_bounded(errors=errors)
if count == -1:
count = self.count_chars() - index
# https://github.com/pypdfium2-team/pypdfium2/issues/261
# https://crbug.com/pdfium/2079
active_range = self._get_active_text_range(index, index+count-1)
if active_range == 0:
return ""
# NOTE since we have converted indices from char to text, they will shift accordingly for inserted/excluded chars, so this will calculate the exact output count
t_start, t_end, l_passive, r_passive = active_range
index += l_passive
count -= l_passive + r_passive
in_count = (t_end+1 - t_start)*2 + 1
buffer = ctypes.create_string_buffer(in_count * 2)
buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
out_count = pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr)
assert in_count >= out_count, f"Buffer too small: {in_count} vs {out_count}"
return buffer.raw[:(out_count-1)*2].decode("utf-16-le", errors=errors)
[docs]
def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
"""
Extract text from given boundaries in PDF coordinates.
If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
Parameters:
errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
Returns:
str: The text on the page area in question, or an empty string if no text was found.
"""
bbox = self.page.get_bbox()
if left is None:
left = bbox[0]
if bottom is None:
bottom = bbox[1]
if right is None:
right = bbox[2]
if top is None:
top = bbox[3]
args = (self, left, top, right, bottom)
n_chars = pdfium_c.FPDFText_GetBoundedText(*args, None, 0)
if n_chars <= 0:
return ""
buffer = ctypes.create_string_buffer(n_chars * 2)
buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
pdfium_c.FPDFText_GetBoundedText(*args, buffer_ptr, n_chars)
return buffer.raw.decode("utf-16-le", errors=errors)
[docs]
def count_chars(self):
"""
Returns:
int: The number of characters on the text page.
"""
n_chars = pdfium_c.FPDFText_CountChars(self)
if n_chars == -1:
raise PdfiumError("Failed to get character count.")
return n_chars
[docs]
def count_rects(self, index=0, count=-1):
"""
Parameters:
index (int): Start character index.
count (int): Character count to consider (defaults to -1 for all remaining).
Returns:
int: The number of text rectangles in the given character range.
"""
n_rects = pdfium_c.FPDFText_CountRects(self, index, count)
if n_rects == -1:
raise PdfiumError("Failed to count rectangles.")
return n_rects
[docs]
def get_index(self, x, y, x_tol, y_tol):
"""
Get the index of a character by position.
Parameters:
x (float): Horizontal position (in PDF canvas units).
y (float): Vertical position.
x_tol (float): Horizontal tolerance.
y_tol (float): Vertical tolerance.
Returns:
int | None: The index of the character at or nearby the point (x, y).
May be None if there is no character or an error occurred.
"""
index = pdfium_c.FPDFText_GetCharIndexAtPos(self, x, y, x_tol, y_tol)
if index < 0:
return None
return index
[docs]
def get_charbox(self, index, loose=False):
"""
Get the bounding box of a single character.
Parameters:
index (int):
Index of the character to work with, in the page's character array.
loose (bool):
Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
Returns:
Float values for left, bottom, right and top in PDF canvas units.
"""
if loose:
rect = pdfium_c.FS_RECTF()
ok = pdfium_c.FPDFText_GetLooseCharBox(self, index, rect)
l, b, r, t = rect.left, rect.bottom, rect.right, rect.top
else:
l, b, r, t = c_double(), c_double(), c_double(), c_double()
ok = pdfium_c.FPDFText_GetCharBox(self, index, l, r, b, t) # yes, lrbt!
l, b, r, t = l.value, b.value, r.value, t.value
if not ok:
raise PdfiumError("Failed to get charbox.")
return l, b, r, t
[docs]
def get_rect(self, index):
"""
Get the bounding box of a text rectangle at the given index.
Note that :meth:`.count_rects` must be called once with default parameters
before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
Returns:
Float values for left, bottom, right and top in PDF canvas units.
"""
l, b, r, t = c_double(), c_double(), c_double(), c_double()
ok = pdfium_c.FPDFText_GetRect(self, index, l, t, r, b) # yes, ltrb!
if not ok:
raise PdfiumError("Failed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.)")
return (l.value, b.value, r.value, t.value)
[docs]
def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False):
"""
Locate text on the page.
Parameters:
text (str):
The string to search for.
index (int):
Character index at which to start searching.
match_case (bool):
If True, the search will be case-specific (upper and lower letters treated as different characters).
match_whole_word (bool):
If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
consecutive (bool):
If False (the default), :meth:`.search` will skip past the current match to look for the next match.
If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
Returns:
PdfTextSearcher: A helper object to search text.
"""
if len(text) == 0:
raise ValueError("Text length must be greater than 0.")
flags = 0
if match_case:
flags |= pdfium_c.FPDF_MATCHCASE
if match_whole_word:
flags |= pdfium_c.FPDF_MATCHWHOLEWORD
if consecutive:
flags |= pdfium_c.FPDF_CONSECUTIVE
enc_text = (text + "\x00").encode("utf-16-le")
enc_text_ptr = ctypes.cast(enc_text, ctypes.POINTER(ctypes.c_ushort))
raw_searcher = pdfium_c.FPDFText_FindStart(self, enc_text_ptr, flags, index)
searcher = PdfTextSearcher(raw_searcher, self)
self._add_kid(searcher)
return searcher
[docs]
class PdfTextSearcher (pdfium_i.AutoCloseable):
"""
Text searcher helper class.
Attributes:
raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
"""
def __init__(self, raw, textpage):
self.raw = raw
self.textpage = textpage
super().__init__(pdfium_c.FPDFText_FindClose)
@property
def parent(self): # AutoCloseable hook
return self.textpage
def _get_occurrence(self, find_func):
ok = find_func(self)
if not ok:
return None
index = pdfium_c.FPDFText_GetSchResultIndex(self)
count = pdfium_c.FPDFText_GetSchCount(self)
return index, count
[docs]
def get_next(self):
"""
Returns:
(int, int): Start character index and count of the next occurrence,
or None if the last occurrence was passed.
"""
return self._get_occurrence(pdfium_c.FPDFText_FindNext)
[docs]
def get_prev(self):
"""
Returns:
(int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
or None if the last occurrence was passed.
"""
return self._get_occurrence(pdfium_c.FPDFText_FindPrev)