# Source code for docp_loaders.loaders.chromapdfloader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the entry point for loading PDF files
            into a Chroma database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Examples:

    Parse and load a *single* PDF file into a Chroma database
    collection::

        >>> from docp_loaders import ChromaPDFLoader

        >>> l = ChromaPDFLoader(path='/path/to/chroma',
                                collection='spam')
        >>> l.load(path='/path/to/directory/myfile.pdf')


    Parse and load a *directory* of PDF files into a Chroma database
    collection::

        >>> from docp_loaders import ChromaPDFLoader

        >>> l = ChromaPDFLoader(path='/path/to/chroma',
                                collection='spam')
        >>> l.load(path='/path/to/directory', ext='pdf')


    For further example code use, please refer to the
    :class:`ChromaPDFLoader` class docstring.

"""

import os
from docp_core.utilities import utilities
# locals
try:
    from ..objects._chromabasepdfloader import _ChromaBasePDFLoader
except ImportError:
    from docp_loaders.objects._chromabasepdfloader import _ChromaBasePDFLoader



# [docs]
class ChromaPDFLoader(_ChromaBasePDFLoader):
    r"""Chroma database PDF-specific document loader.

    Args:
        path (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``path`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        chunk_size (int, optional): Size (in characters) of each text
            chunk after splitting. Defaults to 512.
        chunk_overlap (int, optional): Number of characters to overlap in
            the split text.  Defaults to 128.
        separators (list, optional): Separators to be used by the
            recursive text splitter.
            Defaults to ``['\n\n\n', '\n\n', '\n', ' ']``.
        separators_md (list, optional): Separators to be used by the
            recursive text splitter, **for Markdown files**.
            Defaults to ``['#', '##', '###', '\n']``.
        embedding_model_path (str, optional): Path to the embedding model
            to be used. Defaults to None.
        repo_id (str, optional): Huggingface repository name to be used
            as the embedding model. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    :Examples:

        Parse and load a *single* PDF file into a Chroma database
        collection::

            >>> from docp_loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(path='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory/myfile.pdf')


        Parse and load a *directory* of PDF files into a Chroma
        database collection::

            >>> from docp_loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(path='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory', ext='pdf')

    """

    #
    # No __init__ method here to ensure the ultimate base class'
    # signature is used and to save passing loads of stuff around, if we
    # don't have to.
    #

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             load_from_markdown: bool=False,
             # docp_parsers.PDFParser.extract_text -->
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set | None=None,
             convert_to_ascii: bool=True,
             x_tolerance: int=3,
             y_tolerance: int=3,
             # docp_docling.PDFParser.to_markdown -->
             page_no: int | None=None,
             image_mode: str='placeholder',
             include_annotations: bool=True,
             unique_lines: bool=False,
             **kwargs) -> None:
        """Load a PDF file (or files) into a Chroma database.

        If ``path`` is a directory, the files it contains are collected
        (filtered by ``ext`` and, optionally, searched recursively) and
        loaded one at a time. Otherwise ``path`` is treated as a single
        file and loaded directly. A progress line is printed for each
        file before it is loaded.

        .. note::

            There are *many* arguments in this method. This is because
            these arguments are passed into the document parser(s). Any
            argument which is not accepted by the target parser is simply
            ignored.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. Note: If this is a directory, a
                specific file extension can be passed into the
                :meth:`load` method using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pdf'``. Defaults to '**',
                for a recursive search.

                .. note::

                    If anything other than ``'**'`` is provided, all
                    alpha-characters are parsed from the string, and
                    prefixed with ``*.``. Meaning, if ``'.pdf'`` is
                    passed, the characters ``'pdf'`` are parsed and
                    prefixed with ``*.`` to create ``'*.pdf'``. However,
                    if ``'things.foo'`` is passed, the derived extension
                    will be ``'*.thingsfoo'``.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            load_from_markdown (bool, optional): Convert the PDF text to
                Markdown format and load from the Markdown text.
                Defaults to False.

                .. tip::

                    This is particularly useful when loading PDF
                    documents for use with a RAG pipeline as this method
                    of loading is designed to **keep document sections
                    together** as chunks in the database, which aids in
                    more complete content retrieval.

                    **Note:**
                    This is more processing intensive as ``docling``
                    models are required for the conversion.

            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to None here and is forwarded to the parser
                unchanged; the effective default is documented as
                ``{'Artifact'}`` (presumably substituted by the parser
                when None is received - confirm against the parser), as
                these generally relate to a header and/or footer. To
                include all tags, (not skip any) pass this argument as
                ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.
            x_tolerance (int, optional): Adds space where the difference
                between x1 of one character and the x0 of the next
                character is greater than x_tolerance. Defaults to 3.
            y_tolerance (int, optional): Adds space where the difference
                between y1 of one character and the y0 of the next
                character is greater than y_tolerance. Defaults to 3.
            page_no (int, optional): Page number to convert.
                Defaults to None (for all pages).
            image_mode (str, optional): The mode to use for including
                images in the markdown. Options are: 'embedded',
                'placeholder', 'referenced'. Defaults to 'placeholder'.
            include_annotations (bool, optional): Whether to include
                annotations in the export. Defaults to True.
            unique_lines (bool, optional): Remove any duplicated lines
                from the document's content. Generally used to remove
                repeated header and footer strings. Defaults to False.

        :Keyword Args:
            kwargs (dict): Additional keywords to be passed into the
                document parser(s).

        """
        # pylint: disable=too-many-locals  # A lot of arguments are needed.
        # Prepare the arguments being sent to the doc parser.
        # - This has been done manually as using locals() introduces
        #   unpredictable behaviour.
        _kwargs = {
                   # docp_parsers.PDFParser.extract_text -->
                   'remove_header': remove_header,
                   'remove_footer': remove_footer,
                   'remove_newlines': remove_newlines,
                   'ignore_tags': ignore_tags,
                   'convert_to_ascii': convert_to_ascii,
                   'x_tolerance': x_tolerance,
                   'y_tolerance': y_tolerance,
                   # docp_docling.PDFParser.to_markdown -->
                   'page_no': page_no,
                   'image_mode': image_mode,
                   'include_annotations': include_annotations,
                   'unique_lines': unique_lines,
                   # Caller-supplied extras are forwarded as-is.
                   **kwargs,
                  }
        self._from_md = load_from_markdown  # Set parent class' flag.
        # Load multi: every collected file goes through the same loader.
        if os.path.isdir(path):
            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **_kwargs)
        # Load single: anything that is not a directory is treated as a file.
        else:
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **_kwargs)