Source code for pypdfium2._helpers.textpage

# SPDX-FileCopyrightText: 2024 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

# Names exported by this module.
__all__ = ("PdfTextPage", "PdfTextSearcher")

import ctypes
import logging
import warnings
import pypdfium2.raw as pdfium_c
import pypdfium2.internal as pdfium_i
from pypdfium2._helpers.misc import PdfiumError

# Shorthand for the ctypes double type; the charbox/rect getters in this module allocate their output parameters with it.
c_double = ctypes.c_double

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)



[docs]
class PdfTextPage (pdfium_i.AutoCloseable):
    """
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    """
    
    def __init__(self, raw, page):
        # NOTE(review): the attributes are assigned before the base initializer runs - presumably AutoCloseable's setup reads them (e.g. through the parent hook below); confirm before reordering.
        self.raw = raw
        self.page = page
        super().__init__(pdfium_c.FPDFText_ClosePage)
    
    @property
    def parent(self):  # AutoCloseable hook
        return self.page
    
    
    def _get_active_text_range(self, c_start, c_end, l_passive=0, r_passive=0):
        """
        Shrink the inclusive char index range ``[c_start, c_end]`` until both ends map to a text index.
        
        A char is treated as passive if ``FPDFText_GetTextIndexFromCharIndex()`` returns -1 for it.
        
        Parameters:
            c_start (int): First char index of the range.
            c_end (int): Last char index of the range (inclusive).
            l_passive (int): Initial value of the counter for passive chars skipped on the left.
            r_passive (int): Initial value of the counter for passive chars skipped on the right.
        Returns:
            0 if the range contains no active char, otherwise a tuple
            ``(t_start, t_end, l_passive, r_passive)`` with the text indices of the
            first/last active char and the number of passive chars dropped on either side.
        """
        
        # Implemented with loops rather than recursion, so a long run of passive chars
        # cannot exhaust the interpreter's recursion limit, and t_start is queried only
        # as often as needed.
        to_text_index = pdfium_c.FPDFText_GetTextIndexFromCharIndex
        
        # advance the left end past leading passive chars
        t_start = -1
        while c_start <= c_end:
            t_start = to_text_index(self, c_start)
            if t_start != -1:
                break
            c_start += 1
            l_passive += 1
        if t_start == -1:
            return 0  # no active chars in range
        
        # pull the right end back past trailing passive chars
        while c_start <= c_end:
            t_end = to_text_index(self, c_end)
            if t_end != -1:
                return t_start, t_end, l_passive, r_passive
            c_end -= 1
            r_passive += 1
        
        # Only reachable if the char at c_start stopped mapping to a text index between the two queries.
        return 0
    
    

[docs]
    def get_text_range(self, index=0, count=-1, errors="ignore", force_this=False):
        """
        Warning:
            .. versionchanged:: 4.28
               Unexpected upstream changes have caused allocation size concerns with this API.
               Using it is now discouraged unless you specifically need to extract a character range. Prefer :meth:`.get_text_bounded` where possible.
               Calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
        
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
            force_this (bool): If False (the default), a call with default *index* and *count* emits a warning and is redirected to :meth:`.get_text_bounded`. Pass True to extract through this API regardless.
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        """
        
        # https://github.com/pypdfium2-team/pypdfium2/issues/298
        # https://crbug.com/pdfium/2133
        if (index, count) == (0, -1) and not force_this:
            # stacklevel=2 attributes the warning to the caller's line rather than to this module
            warnings.warn(
                "get_text_range() call with default params will be implicitly redirected to get_text_bounded()",
                stacklevel=2,
            )
            return self.get_text_bounded(errors=errors)
        
        if count == -1:
            count = self.count_chars() - index
        
        # https://github.com/pypdfium2-team/pypdfium2/issues/261
        # https://crbug.com/pdfium/2079
        active_range = self._get_active_text_range(index, index+count-1)
        if active_range == 0:
            return ""
        
        # NOTE since we have converted indices from char to text, they will shift accordingly for inserted/excluded chars, so this will calculate the exact output count
        t_start, t_end, l_passive, r_passive = active_range
        # drop the leading/trailing excluded chars from the requested range
        index += l_passive
        count -= l_passive + r_passive
        # NOTE(review): the factor 2 allocates twice the text index span (plus 1 for the terminator) - presumably a safety margin for the allocation size concerns mentioned in the warning above; confirm.
        in_count = (t_end+1 - t_start)*2 + 1
        
        # 2 bytes per UTF-16 code unit
        buffer = ctypes.create_string_buffer(in_count * 2)
        buffer_ptr = ctypes.cast(buffer, ctypes.POINTER(ctypes.c_ushort))
        out_count = pdfium_c.FPDFText_GetText(self, index, count, buffer_ptr)
        assert in_count >= out_count, f"Buffer too small: {in_count} vs {out_count}"
        if out_count < 1:
            # Nothing was written, not even a terminator. Without this guard, the slice below would get a negative end and hand back the zero-filled buffer.
            return ""
        
        # the last counted unit is the terminator, which is cut off here
        return buffer.raw[:(out_count-1)*2].decode("utf-16-le", errors=errors)

    
    

[docs]
    def get_text_bounded(self, left=None, bottom=None, right=None, top=None, errors="ignore"):
        """
        Extract the text located inside a rectangle given in PDF coordinates.
        Any boundary passed as None is substituted with the matching value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            left: Left boundary, or None.
            bottom: Bottom boundary, or None.
            right: Right boundary, or None.
            top: Top boundary, or None.
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        """
        
        page_box = self.page.get_bbox()
        left = page_box[0] if left is None else left
        bottom = page_box[1] if bottom is None else bottom
        right = page_box[2] if right is None else right
        top = page_box[3] if top is None else top
        
        # the boundaries are handed to pdfium in the order left, top, right, bottom
        rect_args = (self, left, top, right, bottom)
        
        # first call without a buffer, to learn how many chars are needed
        char_total = pdfium_c.FPDFText_GetBoundedText(*rect_args, None, 0)
        if char_total <= 0:
            return ""
        
        # second call fills a buffer of 2 bytes per char, which is decoded as a whole
        raw_buf = ctypes.create_string_buffer(char_total * 2)
        ushort_ptr = ctypes.cast(raw_buf, ctypes.POINTER(ctypes.c_ushort))
        pdfium_c.FPDFText_GetBoundedText(*rect_args, ushort_ptr, char_total)
        return raw_buf.raw.decode("utf-16-le", errors=errors)

    
    

[docs]
    def count_chars(self):
        """
        Returns:
            int: The number of characters on the text page.
        Raises:
            PdfiumError: If pdfium returns -1 as character count.
        """
        total = pdfium_c.FPDFText_CountChars(self)
        if total != -1:
            return total
        raise PdfiumError("Failed to get character count.")

    
    

[docs]
    def count_rects(self, index=0, count=-1):
        """
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        Raises:
            PdfiumError: If pdfium returns -1 as rectangle count.
        """
        total = pdfium_c.FPDFText_CountRects(self, index, count)
        if total != -1:
            return total
        raise PdfiumError("Failed to count rectangles.")

    
    

[docs]
    def get_index(self, x, y, x_tol, y_tol):
        """
        Look up the character index at a position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        """
        found = pdfium_c.FPDFText_GetCharIndexAtPos(self, x, y, x_tol, y_tol)
        # any negative result is reported as None
        return found if found >= 0 else None

    
    

[docs]
    def get_charbox(self, index, loose=False):
        """
        Retrieve the bounding box of one character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        Raises:
            PdfiumError: If the pdfium call reports failure.
        """
        
        if loose:
            # the loose box is delivered through a single rect struct
            box = pdfium_c.FS_RECTF()
            success = pdfium_c.FPDFText_GetLooseCharBox(self, index, box)
            coords = (box.left, box.bottom, box.right, box.top)
        else:
            # the tight box is delivered through four separate doubles
            left, bottom, right, top = (c_double() for _ in range(4))
            success = pdfium_c.FPDFText_GetCharBox(self, index, left, right, bottom, top)  # yes, lrbt!
            coords = tuple(side.value for side in (left, bottom, right, top))
        
        if not success:
            raise PdfiumError("Failed to get charbox.")
        
        return coords

    
    

[docs]
    def get_rect(self, index):
        """
        Retrieve the bounding box of the text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Parameters:
            index (int): Index of the text rectangle.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        Raises:
            PdfiumError: If the pdfium call reports failure.
        """
        left, bottom, right, top = (c_double() for _ in range(4))
        if not pdfium_c.FPDFText_GetRect(self, index, left, top, right, bottom):  # yes, ltrb!
            raise PdfiumError("Failed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.)")
        return tuple(side.value for side in (left, bottom, right, top))

    
    

[docs]
    def search(self, text, index=0, match_case=False, match_whole_word=False, consecutive=False):
        """
        Start a text search on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        Raises:
            ValueError: If *text* is empty.
        """
        
        if len(text) == 0:
            raise ValueError("Text length must be greater than 0.")
        
        # combine the pdfium flag of every enabled option
        flag_switches = (
            (match_case, pdfium_c.FPDF_MATCHCASE),
            (match_whole_word, pdfium_c.FPDF_MATCHWHOLEWORD),
            (consecutive, pdfium_c.FPDF_CONSECUTIVE),
        )
        flags = 0
        for enabled, bit in flag_switches:
            if enabled:
                flags |= bit
        
        # UTF-16-LE data followed by a 2-byte null terminator
        needle = text.encode("utf-16-le") + b"\x00\x00"
        needle_ptr = ctypes.cast(needle, ctypes.POINTER(ctypes.c_ushort))
        handle = pdfium_c.FPDFText_FindStart(self, needle_ptr, flags, index)
        
        # register the new searcher as a kid of this textpage
        result = PdfTextSearcher(handle, self)
        self._add_kid(result)
        return result





[docs]
class PdfTextSearcher (pdfium_i.AutoCloseable):
    """
    Helper class wrapping a text search.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    """
    
    def __init__(self, raw, textpage):
        # attributes first, then the base initializer, which is given pdfium's FPDFText_FindClose
        self.raw = raw
        self.textpage = textpage
        super().__init__(pdfium_c.FPDFText_FindClose)
    
    @property
    def parent(self):  # AutoCloseable hook
        return self.textpage
    
    
    def _get_occurrence(self, find_func):
        # Run the given pdfium find function on this searcher.
        # Returns None if it reports no match, otherwise the (index, count) pair of the match.
        if not find_func(self):
            return None
        return (
            pdfium_c.FPDFText_GetSchResultIndex(self),
            pdfium_c.FPDFText_GetSchCount(self),
        )
    

[docs]
    def get_next(self):
        """
        Advance the search in forward direction.
        
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        """
        step_forward = pdfium_c.FPDFText_FindNext
        return self._get_occurrence(step_forward)

    

[docs]
    def get_prev(self):
        """
        Move the search in backward direction.
        
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if ``FPDFText_FindPrev()`` does not report a match.
        """
        step_backward = pdfium_c.FPDFText_FindPrev
        return self._get_occurrence(step_backward)