Source code for taika.ext.excerpt

"""
:mod:`taika.ext.excerpt` -- Documents excerpts
==============================================

This extension creates an excerpt for each document based on its content.

Event
-----

This extension is subscribed to the "doc-post-read" event.

Frontmatter
-----------

.. data:: excerpt_separator (str)

    Use this separator instead of the global separator defined in the configuration.

Configuration
-------------

.. code-block:: yaml

    excerpt_separator: <!-- read-more -->


.. data:: excerpt_separator (str)

    Default: :code:`None`

    A string that will be used as excerpt separator. Defaults to :code:`None`, in which case
    the excerpt is taken from the first paragraph of the document (see *Process* below).


Process
-------

#. Check for the frontmatter option; otherwise use the global or the default separator.
#. If the separator is None, the first :code:`<p>` tag is retrieved, if it exists.
#. If no :code:`<p>` tag is found, :code:`\\n\\n` (a double line break) is
   used as the separator.
#. If a separator is set, the text before that separator is retrieved, if it exists.
#. The :code:`excerpt` is inserted into the document so it will be accessible.

Classes and Functions
---------------------
"""
import logging
import sys

try:
    import bs4
except ImportError:
    print("taika.ext.excerpt needs beautifulsoup4 to be installed.")
    sys.exit(1)

# Separator used when the site configuration does not define ``excerpt_separator``.
# ``None`` makes ``get_excerpt`` look for the first ``<p>`` tag instead of a marker string.
DEFAULT_SEPARATOR = None
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)


def get_excerpt(site, document):
    """Compute the excerpt of *document* and store it in ``document["excerpt"]``.

    Documents whose URL suffix is not ``.html`` are left untouched.

    The separator is read from ``site.config["excerpt_separator"]`` (falling back
    to :data:`DEFAULT_SEPARATOR`) and is overridden by the ``excerpt_separator``
    key of the document when that key is present.

    * If the separator is ``None``, the text of the first ``<p>`` tag found in the
      content is used, HTML-escaped and wrapped in a fresh ``<p>`` element. If the
      content has no ``<p>`` tag, the text before the first double newline is used.
    * Otherwise the text before the separator is used.

    :param site: object exposing a ``config`` mapping.
    :param document: mapping with at least ``"url"`` (an object with a ``suffix``
        attribute) and ``"content"``; it is modified in place.
    :returns: ``None``.
    """
    # Local import so the block stays self-contained (stdlib only).
    from html import escape

    separator = site.config.get("excerpt_separator", DEFAULT_SEPARATOR)
    if document["url"].suffix != ".html":
        LOGGER.debug(f"Document {document['url']} doesn't finish in html.")
        return

    # The frontmatter option takes precedence over the site-wide one.
    if "excerpt_separator" in document:
        separator = document["excerpt_separator"]

    if separator is not None:
        LOGGER.debug(f"Using separator '{separator}' for document {document['url']}.")
        document["excerpt"] = _text_before(separator, document["content"])
        return

    soup = bs4.BeautifulSoup(document["content"], features="html.parser")
    first_p = soup.find("p")

    if first_p is None:
        LOGGER.debug(f" Didn't found <p> tag. Using \\n\\n as separator.")
        document["excerpt"] = _text_before("\n\n", document["content"])
        return

    # get_text() returns entity-decoded plain text; escape it again before wrapping
    # it in markup so characters such as "<" or "&" cannot turn into live HTML.
    paragraph_text = escape(first_p.get_text(), quote=False)
    document["excerpt"] = "<p>" + paragraph_text + "</p>"
    LOGGER.debug(f"Extracting the first paragraph for {document['url']}.")


def _text_before(string, text):
    """Return the part of *text* that precedes the first occurrence of *string*.

    When *string* does not occur in *text*, the miss is logged at debug level and
    the whole *text* is returned unchanged.

    :param string: separator to look for.
    :param text: text to search in.
    :returns: the prefix of *text* before *string*, or *text* itself when
        *string* is absent.
    """
    pos = text.find(string)
    if pos == -1:
        LOGGER.debug(f"Separator {string} not found on text.")
        LOGGER.debug(text)
        # str.find reports "not found" as -1; slicing with that value would
        # silently drop the last character, so return the text as is.
        return text

    return text[:pos]


def setup(site):
    """Register :func:`get_excerpt` as a handler of the ``doc-post-read`` event of *site*.

    :param site: object exposing an ``events`` attribute with a ``register`` method.
    """
    events = site.events
    events.register("doc-post-read", get_excerpt)