# blog/makesite.py

#!/usr/bin/env python3

"""Make static website/blog with Python."""

import datetime
import os
import re
import shutil
import sys
import unicodedata
from pathlib import Path

from dotenv import load_dotenv
import requests
import mistune
from pygments import highlight
from pygments.lexers import get_lexer_by_name
from pygments.formatters import html


# Abbreviated French weekday names, Monday first.
# NOTE(review): not referenced in the visible part of this file - confirm it is still needed.
FRENCH_WEEKDAYS = ['lun.', 'mar.', 'mer.', 'jeu.', 'ven.', 'sam.', 'dim.']
# Abbreviated French month names, January first (looked up with `month - 1`).
FRENCH_MONTHS = ['janv.', 'févr.', 'mars', 'avr.', 'mai', 'juin',
                 'juil.', 'août', 'sept.', 'oct.', 'nov.', 'déc.']


class HighlightRenderer(mistune.HTMLRenderer):
    """Mistune HTML renderer that syntax-highlights code blocks with Pygments."""

    def block_code(self, code, info=None):
        """Render a code block, highlighted when its language is recognised.

        Args:
            code: The code content to render
            info: Optional info string of the code fence; its first
                whitespace-separated token is used as the language name

        Returns:
            str: Pygments-highlighted HTML, or plain escaped pre/code tags
                when no language is given or Pygments has no lexer for it
        """
        # Local import: keeps the fix self-contained (pygments is already a
        # dependency of this file).
        from pygments.util import ClassNotFound

        # Only the first token names the language; anything after it
        # (e.g. extra fence attributes) is ignored.
        language = info.split(None, 1)[0] if info and info.strip() else ""
        if language:
            try:
                lexer = get_lexer_by_name(language, stripall=True)
            except ClassNotFound:
                # Unknown language: fall back to plain rendering instead of
                # aborting the whole site build.
                lexer = None
            if lexer is not None:
                return highlight(code, lexer, html.HtmlFormatter())
        return '<pre><code>' + mistune.escape(code) + '</code></pre>'


# Shared Markdown-to-HTML converter; code blocks are rendered by HighlightRenderer.
markdown = mistune.create_markdown(renderer=HighlightRenderer())


def fread(filename):
    """Read a text file and return its whole content.

    The file is always decoded as UTF-8 so that accented content is read the
    same way regardless of the platform's default locale encoding.

    Args:
        filename: Path of the file to read

    Returns:
        str: The file content
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()


def fwrite(filename, text):
    """Write text to a file as UTF-8, creating parent directories as needed.

    Args:
        filename: Destination path; may be a bare file name with no directory
        text: Content to write (replaces any existing file content)
    """
    basedir = os.path.dirname(filename)
    # dirname is "" for a bare file name, and os.makedirs("") would fail.
    if basedir:
        os.makedirs(basedir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)


def log(msg, *log_args):
    """Write one line to stderr: msg formatted (str.format) with log_args."""
    formatted = msg.format(*log_args)
    sys.stderr.write("{}\n".format(formatted))


def _strip_tags_and_truncate(text, words=25):
    """Replace HTML tags with spaces and keep only the first `words` words."""
    # (?s) lets a tag span several lines.
    without_tags = re.sub(r"(?s)<.*?>", " ", text)
    leading_words = without_tags.split()[:words]
    return " ".join(leading_words)


def _parse_headers(text):
    """Yield (key, value, end-index) for each leading ``<!-- key: value -->`` header.

    Iteration stops at the first piece of text that is not such a header.
    """
    # Second alternative (.+) catches any other line so the scan can stop there.
    header_or_other = re.compile(r"\s*<!--\s*(.+?)\s*:\s*(.+?)\s*-->\s*|.+")
    for found in header_or_other.finditer(text):
        key, value = found.group(1), found.group(2)
        if not key:
            return
        yield key, value, found.end()


def _rfc_2822_format(date_str):
    """Convert yyyy-mm-dd date string to RFC 2822 format date string.

    Args:
        date_str: Date string in YYYY-MM-DD format

    Returns:
        str: The date at midnight UTC, e.g. "Mon, 15 Jan 2024 00:00:00 +0000"

    Raises:
        ValueError: If date_str does not match YYYY-MM-DD
    """
    # format_datetime always emits English day/month names, whereas
    # strftime('%a'/'%b') follows the process locale.
    from email.utils import format_datetime

    d = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return format_datetime(d.replace(tzinfo=datetime.timezone.utc))


def slugify(value):
    """Build a lowercase ASCII slug from a string.

    Leading and trailing whitespace is stripped, accents are dropped, every
    character that is not a word character, whitespace or hyphen is removed,
    and each run of whitespace becomes a single hyphen.

    Hyphens are not collapsed and whitespace left next to removed punctuation
    still becomes a hyphen (e.g. "Quoi de neuf ?" gives "quoi-de-neuf-").
    This is kept unchanged so that URLs built from existing titles stay the same.

    Args:
        value: Text to convert

    Returns:
        str: The slug
    """
    # Strip first, as the contract promises; previously surrounding
    # whitespace turned into leading/trailing hyphens.
    value = unicodedata.normalize("NFKD", value.strip())
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then drops the marks.
    value = value.encode("ascii", "ignore").decode("ascii")
    value = re.sub(r"[^\w\s-]", "", value)  # Keep word chars, whitespace, hyphens
    value = re.sub(r"\s+", "-", value)      # Each whitespace run -> one hyphen
    return value.lower()


def parse_post_file(filename, params):
    """Parse a post file into a content dictionary.

    Reads the file, takes the date and a default slug from the file name,
    applies the ``<!-- key: value -->`` headers, converts Markdown to HTML
    and builds a summary.

    Args:
        filename: Path of the post; a leading ``yyyy-mm-dd-`` in the base
            name gives the date (default "1970-01-01")
        params: Global parameters dict; "site_url" is read to build the
            RSS version of the content

    Returns:
        dict: "date", "slug", every header found, plus "content",
            "content_rss", "rfc_2822_date" and "summary"
    """
    # Read file content.
    text = fread(filename)

    # Read metadata and save it in a dictionary.
    date_slug = os.path.basename(filename).split(".")[0]
    match = re.search(r"^(?:(\d\d\d\d-\d\d-\d\d)-)?(.+)$", date_slug)
    content = {"date": match.group(1) or "1970-01-01", "slug": match.group(2)}

    # Read headers (they may override "date" and add "title", "category", ...).
    end = 0
    for key, val, end in _parse_headers(text):
        content[key] = val

    # Prefer a slug built from the title; keep the file-name slug when the
    # post has no title header (previously a KeyError) or when the title
    # slugifies to an empty string.
    title = content.get("title")
    if title:
        content["slug"] = slugify(title) or content["slug"]

    # Separate content from headers.
    text = text[end:]

    # Convert Markdown content to HTML.
    if filename.endswith((".md", ".mkd", ".mkdn", ".mdown", ".markdown")):
        # Text before the "<!-- more" marker is the summary; without a marker
        # (or with the marker at the very start) the summary is the first
        # words of the rendered text.
        summary_index = text.find("<!-- more")
        if summary_index > 0:
            summary = markdown(_strip_html_tags(text[:summary_index]))
        else:
            summary = _strip_tags_and_truncate(markdown(_strip_html_tags(text)))
        clean_text = text.replace("<!-- more -->", "")
        text = markdown(clean_text)
    else:
        summary = _strip_tags_and_truncate(text)

    # Update the dictionary with content and RFC 2822 date.
    content.update(
        {
            "content": text,
            "content_rss": _make_links_absolute(params["site_url"], text),
            "rfc_2822_date": _rfc_2822_format(content["date"]),
            "summary": summary,
        }
    )

    return content


def _make_links_absolute(site_url, text):
    """Prefix site_url to root-relative image and post links (for the RSS feed).

    Only ``src="/images/20...`` and ``href="/20...`` are rewritten.
    """
    # TODO externalize links replacement configuration
    replacements = (
        ('src="/images/20', 'src="' + site_url + '/images/20'),
        ('href="/20', 'href="' + site_url + '/20'),
    )
    for relative, absolute in replacements:
        text = text.replace(relative, absolute)
    return text


def _strip_html_tags(text):
    """Remove opening and closing HTML tags, repeating until nothing changes."""

    def strip_once(value):
        # Opening (or self-closing) tags first, then closing tags.
        value = re.sub(r"<\w+.*?>", "", value)
        return re.sub(r"<\/\w+>", "", value)

    # Removing a tag can expose a new one (e.g. "<<i>b>"), hence the loop.
    stripped = strip_once(text)
    while stripped != text:
        text = stripped
        stripped = strip_once(text)
    return stripped


def render(template, **params):
    """Fill ``{{ name }}`` placeholders in template from params.

    A placeholder whose name is not in params is left untouched.
    """

    def substitute(placeholder):
        key = placeholder.group(1)
        if key in params:
            return str(params[key])
        return str(placeholder.group(0))

    return re.sub(r"{{\s*([^}\s]+)\s*}}", substitute, template)


def get_header_list_value(header_name, page_params):
    """Split a space-separated header value into its non-empty items.

    Args:
        header_name: Name of the header to read (e.g., 'category', 'tag')
        page_params: Dict containing page parameters

    Returns:
        list: Stripped, non-empty items; empty when the header is absent
    """
    if header_name not in page_params:
        return []
    pieces = page_params[header_name].split(" ")
    return [piece.strip() for piece in pieces if piece.strip()]


def _render_comment(comment, comment_detail_layout):
    """Render a single comment using the comment detail layout.

    The author name and site URL are supplied by commenters, so they are
    HTML-escaped before being placed in the page. The comment text goes
    through the Markdown renderer.

    Args:
        comment: Dict with keys: author, content, date, and optional: site, avatar
        comment_detail_layout: Template string for rendering a comment

    Returns:
        str: Rendered HTML for the comment
    """
    # Local import under another name: the module-level name `html` is
    # pygments.formatters.html, not the standard library module.
    from html import escape as escape_html

    site = comment.get("site", "")
    if site:
        # quote=True also escapes quotes so the value cannot break out of
        # the href attribute.
        site_start = '<a href="' + escape_html(str(site), quote=True) + '">'
        site_end = '</a>'
    else:
        site_start = ''
        site_end = ''

    # NOTE(review): "avatar" and "date" are inserted as received; whether
    # they can carry untrusted markup depends on the comment service - confirm.
    return render(
        comment_detail_layout,
        author=escape_html(str(comment["author"])),
        avatar=comment.get("avatar", ""),
        site_start=site_start,
        site_end=site_end,
        date=comment["date"],
        content=markdown(comment["content"]),
    )


def _fetch_comments(post_url, stacosys_url, timeout=30):
    """Fetch comments from Stacosys API for a given post URL.

    Args:
        post_url: Relative URL of the post (e.g., "2024/my-post/")
        stacosys_url: Base URL of the Stacosys comment service
        timeout: Seconds to wait for the service before giving up; without
            it an unresponsive service would block the build forever

    Returns:
        list: Value of the "data" key of the JSON response

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout
    """
    req_url = stacosys_url + "/comments"
    query_params = {"url": "/" + post_url}
    resp = requests.get(url=req_url, params=query_params, timeout=timeout)
    return resp.json()["data"]


def _process_comments(page_params, stacosys_url, comment_layout,
                     comment_detail_layout):
    """Fetch and render the comments of a post.

    Args:
        page_params: Dict containing page parameters; 'post_url' is read when
            comments are enabled, and 'comment' set to "no" disables them
        stacosys_url: Base URL of Stacosys service (empty string disables comments)
        comment_layout: Template for the overall comment section
        comment_detail_layout: Template for individual comments

    Returns:
        tuple: (comment_count, comments_html, comment_section_html)
            - comment_count: Number of comments (int)
            - comments_html: Rendered HTML of all comments (str)
            - comment_section_html: comment_layout rendered with the page
              parameters plus 'comments' and 'comment_count' (str)
            All three are 0 / "" / "" when comments are disabled.
    """
    disabled_for_page = page_params.get("comment", "yes") == "no"
    if disabled_for_page or not stacosys_url:
        return 0, "", ""

    comments = _fetch_comments(page_params["post_url"], stacosys_url)
    comment_count = len(comments)
    comments_html = "".join(
        _render_comment(comment, comment_detail_layout)
        for comment in comments
    )

    # Render the whole section from a copy so page_params itself is untouched.
    section_params = dict(
        page_params, comments=comments_html, comment_count=comment_count
    )
    comment_section_html = render(comment_layout, **section_params)

    return comment_count, comments_html, comment_section_html


def _get_friendly_date(date_str):
    """Format a date the French way: zero-padded day, abbreviated month, year.

    Args:
        date_str: Date string in YYYY-MM-DD format

    Returns:
        str: French-formatted date (e.g., "15 janv. 2024", "05 mars 2024")
    """
    parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    return "{:02d} {} {}".format(
        parsed.day, FRENCH_MONTHS[parsed.month - 1], parsed.year
    )


def _process_categories(page_params, category_layout):
    """Read the page's categories and render their combined label.

    Args:
        page_params: Dict containing page parameters; the 'category' header
            is read as a space-separated list
        category_layout: Template string rendered once per category with
            'category' (the name) and 'url' (its slug)

    Returns:
        tuple: (list of category strings, rendered category label HTML)
    """
    categories = get_header_list_value("category", page_params)
    rendered = [
        render(category_layout, category=name, url=slugify(name)).strip()
        for name in categories
    ]
    return categories, "".join(rendered)


def _setup_page_params(content, params):
    """Merge global params with a post's content and add derived values.

    Args:
        content: Dict containing parsed content (must have 'date' and 'slug' keys)
        params: Global parameters dict; content values win on key clashes

    Returns:
        dict: Page parameters with empty header/footer plus date_path,
            friendly_date, year and post_url ("<year>/<slug>/")
    """
    page_params = dict(params)
    page_params.update(content)

    date = page_params["date"]
    year = date.split("-")[0]

    page_params["header"] = ""
    page_params["footer"] = ""
    page_params["date_path"] = date.replace("-", "/")
    page_params["friendly_date"] = _get_friendly_date(date)
    page_params["year"] = year
    page_params["post_url"] = year + "/" + page_params["slug"] + "/"
    return page_params


def make_posts(
        src, src_pattern, dst, layout, category_layout,
        comment_layout, comment_detail_layout, **params
):
    """Render every post found under src and return their content dicts.

    Args:
        src: Directory searched for posts
        src_pattern: Glob pattern, relative to src, selecting post files
        dst: Destination path template, rendered with the page parameters
        layout: Page template for a post
        category_layout: Template for one category label
        comment_layout: Template for the comment section
        comment_detail_layout: Template for one comment
        **params: Global parameters ('stacosys_url' enables comments)

    Returns:
        list: Content dicts of all posts, newest date first
    """
    # Values computed for the page that list pages also need on the item.
    shared_keys = (
        "year", "post_url", "categories", "category_label",
        "tags", "friendly_date", "comment_count",
    )
    stacosys_url = params.get("stacosys_url", "")
    items = []

    for posix_path in Path(src).glob(src_pattern):
        src_path = str(posix_path)
        content = parse_post_file(src_path, params)

        # Resolve global placeholders used inside the post itself.
        for field in ("content", "summary"):
            content[field] = render(content[field], **params)

        page_params = _setup_page_params(content, params)

        (page_params["categories"],
         page_params["category_label"]) = _process_categories(
            page_params, category_layout
        )
        page_params["tags"] = get_header_list_value("tag", page_params)

        comment_count, comments_html, comment_section = _process_comments(
            page_params, stacosys_url, comment_layout, comment_detail_layout
        )
        page_params.update(
            comment_count=comment_count,
            comments=comments_html,
            comment=comment_section,
        )

        for key in shared_keys:
            content[key] = page_params[key]
        items.append(content)

        dst_path = render(dst, **page_params)
        output = render(layout, **page_params)

        log("Rendering {} => {} ...", src_path, dst_path)
        fwrite(dst_path, output)

    items.sort(key=lambda x: x["date"], reverse=True)
    return items


def make_notes(
        src, src_pattern, dst, layout, **params
):
    """Render every note found under src and return their content dicts.

    Notes are written under "notes/<slug>/" and carry an empty friendly date
    and category label.

    Args:
        src: Directory searched for notes
        src_pattern: Glob pattern, relative to src, selecting note files
        dst: Destination path template, rendered with the page parameters
        layout: Page template for a note
        **params: Global parameters

    Returns:
        list: Content dicts of all notes, newest date first
    """
    notes = []

    for note_path in Path(src).glob(src_pattern):
        src_path = str(note_path)
        content = parse_post_file(src_path, params)

        # Resolve global placeholders used inside the note itself.
        for field in ("content", "summary"):
            content[field] = render(content[field], **params)

        page_params = dict(params, **content)
        page_params.update(
            {"header": "", "footer": "", "friendly_date": "", "category_label": ""}
        )
        page_params["post_url"] = "notes/" + page_params["slug"] + "/"

        for key in ("post_url", "friendly_date", "category_label"):
            content[key] = page_params[key]
        notes.append(content)

        dst_path = render(dst, **page_params)
        output = render(layout, **page_params)

        log("Rendering {} => {} ...", src_path, dst_path)
        fwrite(dst_path, output)

    notes.sort(key=lambda x: x["date"], reverse=True)
    return notes


def make_list(
        posts, dst, list_layout, item_layout,
        header_layout, footer_layout, **params
):
    """Render a list page from a sequence of posts and write it to dst.

    Args:
        posts: List of post dictionaries to include in the list
        dst: Destination path template for the generated file
        list_layout: Template for the overall list page
        item_layout: Template for individual list items
        header_layout: Template for page header (None to skip)
        footer_layout: Template for page footer (None to skip)
        **params: Additional parameters for template rendering
    """
    # Header first: the footer template may refer to the rendered header.
    params["header"] = (
        "" if header_layout is None else render(header_layout, **params)
    )
    params["footer"] = (
        "" if footer_layout is None else render(footer_layout, **params)
    )

    rendered_items = []
    for post in posts:
        item_params = dict(params, **post)
        count = item_params.get("comment_count")
        if not count:
            label = ""
        elif count == 1:
            label = "1 commentaire"
        else:
            label = str(count) + " commentaires"
        item_params["comment_label"] = label
        rendered_items.append(render(item_layout, **item_params))

    params["content"] = "".join(rendered_items)
    dst_path = render(dst, **params)
    output = render(list_layout, **params)

    log("Rendering list => {} ...", dst_path)
    fwrite(dst_path, output)


def create_blog(page_layout, list_in_page_layout, params):
    """Generate all posts and the paginated index pages (10 posts per page).

    The first page is written twice, as "_site/index.html" and as
    "_site/page1/index.html", because page 2 links back to "/page1/".
    The 'page', 'next_page' and 'previous_page' entries of params are
    overwritten for each page.

    Args:
        page_layout: Template for individual pages
        list_in_page_layout: Template for list pages wrapped in page layout
        params: Global site parameters (modified in place, see above)

    Returns:
        list: Sorted list of all post dictionaries (newest first)
    """
    banner_layout = fread("layout/banner.html")
    paging_layout = fread("layout/paging.html")
    post_layout = render(page_layout, content=fread("layout/post.html"))
    comment_layout = fread("layout/comment.html")
    comment_detail_layout = fread("layout/comment-detail.html")
    category_layout = fread("layout/category.html")
    item_layout = fread("layout/item.html")

    posts = make_posts(
        "posts",
        "**/*.md",
        "_site/{{ post_url }}/index.html",
        post_layout,
        category_layout,
        comment_layout,
        comment_detail_layout,
        **params
    )

    # Create blog list pages by 10.
    page_size = 10
    pages = [
        posts[start: start + page_size]
        for start in range(0, len(posts), page_size)
    ]
    last_page = len(pages)
    list_layouts = (list_in_page_layout, item_layout, banner_layout, paging_layout)

    for page, chunk in enumerate(pages, start=1):
        params["page"] = page
        params["next_page"] = (
            "" if page == last_page else "/page" + str(page + 1) + "/"
        )
        if page == 1:
            params["previous_page"] = ""
            # Site home page: same content as page 1.
            make_list(chunk, "_site/index.html", *list_layouts, **params)
        else:
            params["previous_page"] = "/page" + str(page - 1) + "/"
        make_list(
            chunk,
            "_site/page" + str(page) + "/index.html",
            *list_layouts,
            **params
        )
    return posts


def generate_categories(list_in_page_layout, item_nosummary_layout,
                        posts, params):
    """Write one list page per category at "_site/<category-slug>/index.html".

    Args:
        list_in_page_layout: Template for list pages
        item_nosummary_layout: Template for list items without summaries
        posts: List of all blog posts (each with a 'categories' list)
        params: Global site parameters; 'category' is overwritten with each
            category name in turn
    """
    category_title_layout = fread("layout/category_title.html")

    # Group posts by category, keeping first-seen category order.
    posts_by_category = {}
    for post in posts:
        for cat in post["categories"]:
            posts_by_category.setdefault(cat, []).append(post)

    for cat, cat_posts in posts_by_category.items():
        params["category"] = cat
        make_list(
            cat_posts,
            "_site/" + slugify(cat) + "/index.html",
            list_in_page_layout,
            item_nosummary_layout,
            category_title_layout,
            None,
            **params
        )


def generate_archives(blog_posts, list_in_page_layout, item_nosummary_layout,
                      archive_title_layout, params):
    """Write the archives page listing every blog post.

    Args:
        blog_posts: List of all blog posts
        list_in_page_layout: Template for list pages
        item_nosummary_layout: Template for list items without summaries
        archive_title_layout: Template for archive page header
        params: Global site parameters
    """
    make_list(
        posts=blog_posts,
        dst="_site/archives/index.html",
        list_layout=list_in_page_layout,
        item_layout=item_nosummary_layout,
        header_layout=archive_title_layout,
        footer_layout=None,  # no footer on the archives page
        **params
    )


def generate_notes(page_layout, archive_title_layout,
                   list_in_page_layout, params):
    """Render every note and the notes index page.

    Args:
        page_layout: Template for individual pages
        archive_title_layout: Template for notes index header
        list_in_page_layout: Template for list pages
        params: Global site parameters
    """
    note_template = fread("layout/note.html")
    item_note_layout = fread("layout/item_note.html")
    # Embed the note template into the generic page layout.
    note_layout = render(page_layout, content=note_template)

    notes = make_notes(
        src="notes",
        src_pattern="**/*.md",
        dst="_site/{{ post_url }}/index.html",
        layout=note_layout,
        **params
    )

    make_list(
        posts=notes,
        dst="_site/notes/index.html",
        list_layout=list_in_page_layout,
        item_layout=item_note_layout,
        header_layout=archive_title_layout,
        footer_layout=None,
        **params
    )


def generate_rss_feeds(posts, params):
    """Write the main RSS feed (two file names) and one feed per tag.

    Args:
        posts: List of all blog posts (each with a 'tags' list)
        params: Global site parameters; 'tag' is overwritten with each tag
            name in turn
    """
    rss_xml = fread("layout/rss.xml")
    rss_item_xml = fread("layout/rss_item.xml")

    def write_feed(feed_posts, feed_path):
        # Feeds have neither header nor footer template.
        make_list(
            feed_posts, feed_path, rss_xml, rss_item_xml, None, None, **params
        )

    # Main feed: the first 10 posts, published under both names.
    latest_posts = posts[:10]
    for filename in ("_site/rss.xml", "_site/index.xml"):
        write_feed(latest_posts, filename)

    # Group posts by tag, keeping first-seen tag order.
    posts_by_tag = {}
    for post in posts:
        for tag in post["tags"]:
            posts_by_tag.setdefault(tag, []).append(post)

    for tag, tagged_posts in posts_by_tag.items():
        params["tag"] = tag
        write_feed(tagged_posts, "_site/rss." + slugify(tag) + ".xml")


def generate_sitemap(posts, params):
    """Write "_site/sitemap.xml" with one entry per post.

    Args:
        posts: List of all blog posts
        params: Global site parameters
    """
    make_list(
        posts=posts,
        dst="_site/sitemap.xml",
        list_layout=fread("layout/sitemap.xml"),
        item_layout=fread("layout/sitemap_item.xml"),
        header_layout=None,
        footer_layout=None,
        **params
    )


def get_params(env_file=None):
    """Load site parameters from .env files.

    ".env" is always loaded first; env_file, when given, is loaded afterwards
    with override enabled so its values replace those already set.

    Args:
        env_file: Optional .env file to load and override .env values

    Returns:
        dict: Site parameters (title, subtitle, author, site_url,
            current_year, stacosys_url, external_check) with defaults for
            variables that are not set
    """
    # Load .env file first
    load_dotenv(".env")

    # Load override file if specified
    if env_file:
        load_dotenv(env_file, override=True)
        # Pass the path as a format argument: log() runs str.format on the
        # message, so a path containing braces must not be part of it.
        log("use params from {}", env_file)
    else:
        log("use params from .env")

    # Build params from environment variables
    params = {
        "title": os.getenv("TITLE", "Blog"),
        "subtitle": os.getenv("SUBTITLE", "Lorem Ipsum"),
        "author": os.getenv("AUTHOR", "Admin"),
        "site_url": os.getenv("SITE_URL", "http://localhost:8000"),
        "current_year": datetime.datetime.now().year,
        "stacosys_url": os.getenv("STACOSYS_URL", ""),
        "external_check": os.getenv("EXTERNAL_CHECK", ""),
    }

    return params


def rebuild_site_directory():
    """Recreate the _site directory as a fresh copy of the static directory."""
    output_dir = "_site"
    # Drop any previous build so no stale file survives.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    shutil.copytree("static", output_dir)


def main(env_file=None):
    """Generate the whole site into _site.

    Steps, in order: load parameters, rebuild _site from the static files,
    then generate the blog, category pages, archives, notes, RSS feeds and
    sitemap.

    Args:
        env_file: Optional .env file to override .env values
    """
    params = get_params(env_file)

    # Start from a clean _site directory.
    rebuild_site_directory()

    # Layouts shared by several generation steps.
    page_layout = fread("layout/page.html")
    list_in_page_layout = render(
        page_layout, content=fread("layout/list.html")
    )
    archive_title_layout = fread("layout/archives.html")
    item_nosummary_layout = fread("layout/item_nosummary.html")

    blog_posts = create_blog(page_layout, list_in_page_layout, params)

    generate_categories(
        list_in_page_layout, item_nosummary_layout, blog_posts, params
    )
    generate_archives(
        blog_posts, list_in_page_layout, item_nosummary_layout,
        archive_title_layout, params
    )
    generate_notes(
        page_layout, archive_title_layout, list_in_page_layout, params
    )
    generate_rss_feeds(blog_posts, params)
    generate_sitemap(blog_posts, params)


if __name__ == "__main__":
    # Pick the override env file from the command line flags;
    # --local-stacosys takes precedence over --local.
    if "--local-stacosys" in sys.argv:
        env_file = ".env.local-stacosys"
    else:
        env_file = ".env.local" if "--local" in sys.argv else None
    main(env_file)