Source code for docp_loaders.loaders.chromapptxloader

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the entry point for loading PPTX files
            into a Chroma database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Examples:

    Parse and load a *single* PPTX file into a Chroma database
    collection::

        >>> from docp_loaders import ChromaPPTXLoader

        >>> l = ChromaPPTXLoader(path='/path/to/chroma',
                                 collection='spam',
                                 split_text=False)
        >>> l.load(path='/path/to/directory/myfile.pptx')


    Parse and load a *directory* of PPTX files into a Chroma database
    collection::

        >>> from docp_loaders import ChromaPPTXLoader

        >>> l = ChromaPPTXLoader(path='/path/to/chroma',
                                 collection='spam',
                                 split_text=False)
        >>> l.load(path='/path/to/directory', ext='pptx')


    For further example code use, please refer to the
    :class:`ChromaPPTXLoader` class docstring.

"""

import os
from docp_core.utilities import utilities
# locals
try:
    from ..objects._chromabasepptxloader import _ChromaBasePPTXLoader
except ImportError:
    from docp_loaders.objects._chromabasepptxloader import _ChromaBasePPTXLoader



[docs]
class ChromaPPTXLoader(_ChromaBasePPTXLoader):
    """Chroma database PPTX-specific document loader.

    Args:
        path (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``path`` argument is a
            directory path (rather than a database instance).
            Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. tip::

        It is recommended to pass ``split_text=False`` into the
        :class:`ChromaPPTXLoader` constructor.

        Often, PowerPoint presentations are structured such that related
        text is found in the same 'shape' (textbox) on a slide.
        Splitting the text in these shapes may have undesired results.

    :Examples:

        Parse and load a *single* PPTX file into a Chroma database
        collection::

            >>> from docp_loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(path='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory/myfile.pptx')


        Parse and load a *directory* of PPTX files into a Chroma database
        collection::

            >>> from docp_loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(path='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory', ext='pptx')

    """

    def load(self,
             path: str,
             *,
             ext: str = '**',
             recursive: bool = True,
             remove_newlines: bool = True,
             convert_to_ascii: bool = True,
             **kwargs) -> None:
        """Load a PPTX file (or files) into a Chroma database.

        If ``path`` is an existing directory, the files within it are
        collected and loaded one at a time. Any other ``path`` value is
        treated as a single file and passed directly to the loader; the
        ``ext`` and ``recursive`` arguments are not used in that case.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. Note: If this is a directory, a
                specific file extension can be passed into the
                :meth:`load` method using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pptx'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pptx'`` is passed, the
                characters ``'pptx'`` are parsed and prefixed with ``*.``
                to create ``'*.pptx'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            kwargs (dict): Additional keywords to be passed into the
            document parser(s).

        """
        # Prepare the arguments being sent to the doc parser.
        # - This has been done manually as using locals() introduces
        #   unpredictable behaviour.
        _kwargs = {
            'remove_newlines': remove_newlines,
            'convert_to_ascii': convert_to_ascii,
            **kwargs,
        }
        # Load multi
        if os.path.isdir(path):
            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
            count = len(files)
            if not count:
                # Tell the user why nothing happened, rather than
                # returning silently.
                print(f'No files found to process in: {path}')
                return
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **_kwargs)
            return
        # Load single
        # - Anything which is not an existing directory is handed to the
        #   loader as a file path.
        print(f'Processing: {os.path.basename(path)} ...')
        self._load(path=path, **_kwargs)