# jimsite/jimsite.py

import os
import re
import glob
import shutil
import subprocess
import markdown
import yaml
import pydantic
from typing import Optional
from datetime import datetime, date
from dotmap import DotMap

class GlobalVars(pydantic.BaseModel):
    '''Static-valued global variables to be interpolated into any HTML templates.

    `format_html_template` passes an instance to `str.format` under the name
    `globalvars`, so templates can reference fields as `{globalvars.<field>}`.
    '''
    # NOTE: the default is computed once, when this class body is executed
    # (i.e. at import time), not each time a GlobalVars instance is created.
    # `datetime.today()` returns a `datetime` (a subclass of `date`), so the
    # value carries a time component unless pydantic coerces it --
    # TODO confirm how the installed pydantic version treats defaults.
    today: date = datetime.today()


def filepath_or_string(s: str) -> str:
    '''Returns the text of the file at `s` when `s` names an existing file;
    any other string is handed back unchanged.'''

    # Guard clause: anything that is not a path to a regular file is treated
    # as literal content.
    if not os.path.isfile(s):
        return s

    with open(s, 'r') as source_file:
        contents = source_file.read()
    return contents


def extract_placeholders(s: str) -> set:
    '''Collects the distinct `{variable}` placeholder names that appear in an
    unformatted template string.

    A placeholder name may contain word characters (letters, digits,
    underscores) and dots, e.g. `{title}` or `{metadata.date}`.
    '''

    # The capture group keeps only the name between the braces; building a
    # set collapses repeated placeholders into one entry.
    return {
        found.group(1)
        for found in re.finditer(r'\{([\w\.]+)\}', s)
    }


def find_cyclical_placeholders(s: str, _parents: tuple = None, _cycles: set = None, **kwargs) -> set[tuple]:
    '''Walks the placeholders of a template string depth-first, substituting
    the supplied kwargs along the way, and records every placeholder that
    ends up referring back to itself (which would otherwise cause infinite
    recursion during formatting).

    `_parents` and `_cycles` are internal bookkeeping for the recursion and
    are not meant to be supplied by callers.

    Returns the set of cyclical paths found. Each path is a tuple of
    placeholder names, from the top-level placeholder down to the repeat.
    '''

    # Ancestry of the string currently being inspected (empty at the top level).
    lineage = () if _parents is None else _parents

    # One shared set collects the cycles across the entire recursion.
    found = set() if _cycles is None else _cycles

    # A string without placeholders yields nothing to iterate, which is what
    # terminates the recursion.
    for name in extract_placeholders(s):
        path = lineage + (name,)

        # A name that already appears among its own ancestors closes a cycle.
        if name in lineage:
            found.add(path)
            continue

        # Otherwise resolve just this one placeholder and inspect whatever
        # placeholders its value contains in turn.
        resolved = ('{' + name + '}').format(**kwargs)
        find_cyclical_placeholders(
            resolved,
            _parents = path,
            _cycles = found,
            **kwargs
        )

    return found


# Load the site configuration once, at import time. The path is relative, so
# the file is looked up in the current working directory. The model classes
# defined below read their default values from the resulting `config` object.
with open('config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file.read())

class SiteConfig(pydantic.BaseModel):
    '''Settings for a single site.

    Every field is optional. A field that is not supplied falls back to the
    value stored under the same key in the `site_defaults` section of the
    loaded config, or to None when that key is absent. The defaults are
    looked up once, when this class body is executed, and a config without a
    `site_defaults` section raises KeyError at that point.
    '''
    # presumably the URL the site is served from -- TODO confirm
    base_url: Optional[str] = config['site_defaults'].get('base_url')
    # presumably the repository holding the site's sources -- TODO confirm
    git_repo: Optional[str] = config['site_defaults'].get('git_repo')
    # Directory that asset and article patterns are resolved against.
    build_cache: Optional[str] = config['site_defaults'].get('build_cache')
    # Paths/glob patterns, relative to the build cache, copied by copy_assets.
    assets: Optional[list] = config['site_defaults'].get('assets')
    # Directory that copy_assets copies the assets into.
    web_root: Optional[str] = config['site_defaults'].get('web_root')
    # Paths/glob patterns, relative to the build cache, loaded by build_index.
    articles: Optional[list] = config['site_defaults'].get('articles')

class ArticleMetadata(pydantic.BaseModel):
    '''Metadata of an article, built by `load_markdown` from the YAML block
    that precedes the article body.

    `title`, `date`, `published` and `tags` have no default and must be
    present in the metadata block.
    '''
    title: str
    # Defaults to the top-level `author` entry of the loaded config, if any.
    author: Optional[str] = config.get('author')
    # Used by build_blog_archive as the sort key (newest first).
    # NOTE(review): this field shares its name with the imported `date` type
    # that the annotations in this class refer to -- confirm pydantic resolves
    # `Optional[date]` below to the type as intended.
    date: date
    # presumably the date the article was last modified -- TODO confirm
    lastmod: Optional[date] = None
    # build_index leaves out articles whose `published` value is falsy.
    published: bool
    # Tag names; build_blog_archive renders them through format_blog_tags.
    tags: list
    # presumably a path or URL to a thumbnail image -- TODO confirm
    thumbnail: Optional[str] = None


def load_markdown(md: str) -> tuple[ArticleMetadata|None, str]:
    '''Loads a Markdown document into a (metadata: ArticleMetadata, content: str) pair.

    `md` may be a path to a Markdown file or the Markdown text itself. The
    document may begin with a YAML metadata block that is separated from the
    article body by a `---` delimiter (and optionally opened by one as well).

    Returns:
        (ArticleMetadata, html) when a metadata block is present; `html` is
        the article body converted to an HTML string.
        (None, text) when the document has no `---` separator. In that case
        the text is returned as loaded, NOT converted to HTML.

    Raises:
        pydantic.ValidationError: the metadata block does not satisfy the
            ArticleMetadata model.
    '''

    # Load the file contents if a filepath is specified, and strip document delimiters ('---').
    # Note that `str.strip` treats its argument as a set of characters, so
    # this removes every leading/trailing hyphen, not just a run of three.
    md = filepath_or_string(md).strip().strip('---').strip()

    # If there is no `---` delimiter, then the article has no metadata.
    if '---' not in md.strip('---'):
        return None, md

    # Split the metadata from the contents at the FIRST delimiter only. The
    # article body may legitimately contain further `---` sequences (for
    # example horizontal rules or table separators); splitting on every
    # occurrence would produce more than two parts and fail to unpack.
    raw_metadata, raw_article = md.split('---', 1)

    # Use YAML to parse the metadata.
    metadata = yaml.safe_load(raw_metadata)

    # Convert the contents to a HTML string.
    content = markdown.markdown(raw_article)

    return ArticleMetadata(**metadata), content


def format_html_template(template: str, **kwargs) -> str:
    '''Interpolates variables specified as keyword arguments
    into the given HTML template.

    `template` may be a path to a template file or the template text itself.
    Placeholders use `str.format` syntax (`{name}`, `{name.attribute}`). A
    value may itself contain placeholders; these are resolved by formatting
    repeatedly until none remain. A `GlobalVars` instance is always available
    to the template as `globalvars`.

    Raises:
        ValueError: the placeholders reference each other cyclically, which
            would otherwise make the interpolation loop forever.

    # Example

    ```python
    format_html_template('<h1>{title}</h1>', title = '{site} blog', site = 'My')

    >>> '<h1>My blog</h1>'
    ```
    '''

    # Load the template if a filepath is given.
    template = filepath_or_string(template)

    # The same global variables are used for validation and for every
    # interpolation pass, so build them once.
    globalvars = GlobalVars()

    # Ensure the template does not have cyclical placeholder references.
    cycles = find_cyclical_placeholders(template, globalvars = globalvars, **kwargs)

    if cycles:
        # The f-prefix is required for the offending cycles to actually
        # appear in the message.
        raise ValueError(f'Template has cyclical dependencies: {cycles}')

    # Iteratively interpolate global variables and the kwargs into the template until
    # there are no more placeholders. The loop is used to account for nested template references.
    formatted_html = template
    while extract_placeholders(formatted_html):
        formatted_html = formatted_html.format(globalvars = globalvars, **kwargs)

    # Return the formatted HTML.
    return formatted_html


def run(cmd):
    '''Executes `cmd` as a subprocess and returns the resulting
    CompletedProcess, with stdout and stderr captured.

    The command string is split on single spaces to form the argument list,
    so individual arguments cannot contain spaces.
    '''
    argv = cmd.split(' ')
    return subprocess.run(argv, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
def pull_git_repo(repo: str, build_cache: str) -> None:
    '''Brings the build cache directory up to date with a repo: pulls when
    the directory already holds a git checkout, clones into it otherwise.'''

    # A `.git` entry inside the build cache marks an existing checkout.
    already_cloned = os.path.exists(f'{build_cache}/.git')

    if already_cloned:
        git_command = f'git -C {build_cache} pull origin'
    else:
        git_command = f'git clone {repo} {build_cache}'

    run(git_command)


def load_partials() -> dict:
    """Reads every file in the templates/partials directory and returns the
    formatted partials, keyed as `partials.<filestem>`."""
    formatted_partials = {}

    for entry in os.listdir('templates/partials'):

        # Read the raw, unformatted partial.
        with open(f'templates/partials/{entry}') as handle:
            raw_partial = handle.read()

        # The key is the filename without its extension, under a `partials.` prefix.
        stem, _extension = os.path.splitext(entry)
        formatted_partials[f'partials.{stem}'] = format_html_template(
            raw_partial,
            current_year = datetime.now().year
        )

    return formatted_partials


def import_resume():
    '''Copies the most recent resume PDF from the resume repo into /dist.

    Looks through `build/resume` for entries named as YYYY-MM-DD dates, picks
    the one with the latest date, and copies its `shepich_resume.pdf` to
    `dist/shepich_resume.pdf`. Entries whose names are not dates are ignored.
    If no dated entry exists, nothing is copied.
    '''

    latest_folder = None
    latest_date = None

    # Loop through the folders in the resume repo to find the most recent one.
    for resume_folder in os.listdir('build/resume'):

        # Skip folders that are not in YYYY-MM-DD format.
        try:
            folder_date = datetime.strptime(resume_folder, '%Y-%m-%d')
        except ValueError:
            continue

        # Keep track of the most recent date. Compare the parsed dates rather
        # than the folder names: strptime also accepts names without zero
        # padding (e.g. `2024-9-1`), which do not sort correctly as strings.
        if latest_date is None or folder_date > latest_date:
            latest_folder, latest_date = resume_folder, folder_date

    # Without any dated folder there is no resume to copy.
    if latest_folder is None:
        return

    # Copy the resume into the /dist directory.
    run(f'cp build/resume/{latest_folder}/shepich_resume.pdf dist/shepich_resume.pdf')


def format_blog_tags(tags: list[str], template = 'templates/components/blog_tag.html') -> list[str]:
    '''Renders one HTML blog tag component per tag name, in the order given.'''
    rendered_tags = []
    for tag in tags:
        # Each component is the tag template with the tag's name filled in.
        rendered_tags.append(format_html_template(template, tag_name = tag))
    return rendered_tags


def build_blog_archive(
        index: dict[str, tuple[str, str]],
        page_template = 'templates/pages/default.html',
        li_template = 'templates/components/blog_archive_li.html',
        **kwargs
    ) -> str:
    '''Renders an index (a filestem: (metadata, contents) dict) as an HTML
    page listing its articles from newest to oldest.

    Note: the partials have to be passed in through the kwargs, because the
    overall page template needs them.
    '''

    # Order the entries by article date (oldest first); the loop below walks
    # the result backwards to get newest first.
    oldest_first = sorted(index.items(), key = lambda entry: entry[1][0].date)

    # Render one list item per article, including its metadata tags.
    list_items = []
    for filestem, (metadata, _contents) in reversed(oldest_first):
        tag_html = ' '.join(format_blog_tags(metadata.tags))
        list_items.append(
            format_html_template(
                li_template,
                article_filestem = filestem,
                blog_tags = tag_html,
                metadata = metadata
            )
        )

    # Wrap the items in an unordered list.
    archive_html_content = '<ul>' + ''.join(list_items) + '</ul>'

    # Place the list into the overall page template.
    return format_html_template(
        page_template,
        content = archive_html_content,
        **kwargs
    )

def copy_assets(site: SiteConfig):
    '''Copies the list of site assets from the build cache to the web root.

    Each entry of `site.assets` is a path or glob pattern relative to
    `site.build_cache`. Every file or directory it matches is copied to the
    same relative location under `site.web_root`, replacing whatever is
    already there. A site without assets is a no-op.

    Returns None.
    '''

    for pattern in site.assets or []:

        # Assets are defined relative to the build cache; construct the full
        # path and expand any globbed expressions.
        for asset in glob.glob(f'{site.build_cache}/{pattern.lstrip("/")}'):

            # Construct the destination path analogous to the source path
            # but in the web root instead of the build cache. It must be
            # derived from the matched asset itself: the pattern may contain
            # wildcards and may match several assets.
            relative_path = os.path.relpath(asset, site.build_cache)
            destination = os.path.join(site.web_root, relative_path)

            # Make sure the directory that will hold the copy exists.
            destination_parent = os.path.dirname(destination)
            if destination_parent:
                os.makedirs(destination_parent, exist_ok=True)

            # Delete existing files.
            shutil.rmtree(destination, ignore_errors=True)

            # Copy the asset. Anything that is neither a directory nor a
            # regular file (e.g. a broken symlink) is skipped.
            if os.path.isdir(asset):
                shutil.copytree(asset, destination)
            elif os.path.isfile(asset):
                shutil.copyfile(asset, destination)

    return None


def build_index(site: SiteConfig) -> dict:
    '''Loads the site's articles into an index mapping the filename stem
    to a (metadata: ArticleMetadata, content: str) tuple.

    Each entry of `site.articles` is a path or glob pattern relative to
    `site.build_cache`. Articles that have no metadata block, or whose
    metadata marks them as unpublished, are left out of the index. When two
    articles share a filename stem, the one loaded later wins.
    '''

    index = {}

    for pattern in site.articles or []:

        # Article paths are defined relative to the build cache; construct
        # the full path and expand any globbed expressions.
        for article in glob.glob(f'{site.build_cache}/{pattern.lstrip("/")}'):
            metadata, content = load_markdown(article)

            # Skip articles without metadata (load_markdown returns None for
            # them, and they cannot be dated or sorted) and unpublished ones.
            if metadata is None or not metadata.published:
                continue

            article_filestem = os.path.splitext(os.path.basename(article))[0]
            index[article_filestem] = (metadata, content)

    return index


def map_templates(dir: str, parent = '') -> DotMap:
    '''Recursively maps the templates directory into a nested dict structure.
    Leaves map the filestems of .html template files to their contents.

    `dir` is the directory to map and `parent` is the path that contains it
    (empty for the top-level call). Subdirectories become nested maps keyed
    by directory name; files that are not .html are ignored.
    '''

    output = {}

    # Full path of the directory being mapped, from the root of the tree.
    root = os.path.join(parent, dir)

    # List the files and subdirectories at the top level.
    for sub in os.listdir(root):

        # Construct the full path to the file or subdir from the root of the tree.
        full_path = os.path.join(root, sub)

        # Recursively map subdirectories. The whole path walked so far is
        # passed down as the parent; passing only `dir` would lose the
        # earlier ancestors for directories nested two or more levels deep.
        if os.path.isdir(full_path):
            output[sub] = map_templates(sub, parent = root)
            continue

        # Templates must be .html files.
        filestem, ext = os.path.splitext(sub)
        if ext != '.html':
            continue

        # Load template file.
        with open(full_path, 'r') as file:
            html = file.read()

        output[filestem] = html

    return DotMap(output)


# No command-line entry point is implemented yet: running this file directly
# executes the module-level code above and then does nothing further.
if __name__ == '__main__':
    pass