wip

2025-09-21 19:43:16 +02:00
parent f6b9ff6a78
commit 80e30294db
9 changed files with 2340 additions and 0 deletions
--- a/Karen.md
+++ b/Karen.md
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # markepub
 normalize-md
 In place, take a Markdown file and:
 - remove double empty lines
 - add line break after every full-stop
--- a/markepub/init.py
+++ b/markepub/init.py
--- a/markepub/de_meta.py
+++ b/markepub/de_meta.py
@@ -0,0 +1,156 @@
 from collections.abc import Mapping, Sequence
 import pandoc
 from lxml import etree
 from lxml.builder import E
 # noinspection PyUnresolvedReferences
 from pandoc.types import Meta, MetaMap, MetaList, MetaBool, MetaString, MetaInlines, MetaBlocks, Str, Emph, Underline, \
    Strong, Strikeout, Superscript, Subscript, SmallCaps, Quoted, Cite, Code, Space, SoftBreak, LineBreak, Math
 from util import get_xhtml_template
 def resolve_inline(value):
    """Convert a pandoc inline element into its plain-text equivalent.

    ``Str`` yields its text, ``Space`` and ``SoftBreak`` yield a single
    blank and ``LineBreak`` yields a newline.  Every other value (``Emph``,
    ``Strong``, ``Code``, ``Math``, ...) is not resolved yet and is handed
    back unchanged.
    """
    if isinstance(value, Str):
        return value[0]
    if isinstance(value, (Space, SoftBreak)):
        return ' '
    if isinstance(value, LineBreak):
        return '\n'
    return value
 def resolve_meta_value(value):
    """Wrap or unwrap a pandoc meta value so it behaves like a Python object.

    ``MetaBool`` and ``MetaString`` are unwrapped to their payload; maps,
    lists, inlines and blocks are wrapped in the matching ``PyMeta*`` view.
    Anything else is returned as is.
    """
    if isinstance(value, (MetaBool, MetaString)):
        return value[0]
    wrappers = (
        (MetaMap, PyMetaMap),
        (MetaList, PyMetaList),
        (MetaInlines, PyMetaInlines),
        (MetaBlocks, PyMetaBlocks),
    )
    for pandoc_type, wrapper in wrappers:
        if isinstance(value, pandoc_type):
            return wrapper(value)
    return value
 class _Sequence(Sequence):
    def __getitem__(self, index: int):
        return self._data[index]
    def __len__(self) -> int:
        return len(self._data)
    def __init__(self, p) -> None:
        self._data = [v for v in p[0]]
 class PyMetaBlocks(_Sequence):
    """Sequence view used by ``resolve_meta_value`` for ``MetaBlocks`` values."""
 class PyMetaInlines(_Sequence):
    """Sequence view used by ``resolve_meta_value`` for ``MetaInlines`` values."""
 # noinspection PyMissingConstructor
 class PyMetaList(_Sequence):
    """Sequence view whose items are each passed through ``resolve_meta_value``."""

    def __init__(self, p) -> None:
        self._data = list(map(resolve_meta_value, p[0]))
 class PyMetaMap(Mapping):
    """Read-only mapping view over a pandoc ``Meta`` or ``MetaMap`` value.

    The first field of the wrapped object (``pandoc_meta[0]``) must be a
    mapping of keys to pandoc meta values.  Every value is converted with
    ``resolve_meta_value`` once, when the view is constructed.
    """

    def __init__(self, pandoc_meta):
        # Read from the constructor argument.  The previous code read
        # ``self._pandoc``, an attribute that is never assigned, so building
        # a PyMetaMap always raised AttributeError.
        self._data = {
            key: resolve_meta_value(item)
            for key, item in pandoc_meta[0].items()
        }

    def __getitem__(self, key: str, /):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self) -> int:
        return len(self._data)

    def __repr__(self) -> str:
        # Mapping provides no repr; show the resolved content for debugging.
        return f'{type(self).__name__}({self._data!r})'
 if __name__ == '__main__':
    # Manual smoke test: build a pandoc Meta tree by hand, wrap it in
    # PyMetaMap and print the resulting mapping.
    data = Meta({
        'creator': MetaList([
            MetaMap({
                'file-as': MetaInlines([
                    Str('Riter,'),
                    Space(),
                    Str('E.Z.')]),
                'role': MetaInlines([
                    Str('aut')]),
                'text': MetaInlines([
                    Str('E.Z.'),
                    Space(),
                    Str('Riter')])})]),
        'description': MetaInlines([
            Str('Karen'),
            Space(),
            Str('meets'),
            Space(),
            Str('the'),
            Space(),
            Str('man'),
            Space(),
            Str('she'),
            Space(),
            Str('can’t'),
            Space(),
            Str('resist'),
            Space(),
            Str('—'),
            Space(),
            Str('the'),
            Space(),
            Str('man'),
            Space(),
            Str('who'),
            Space(),
            Str('can'),
            Space(),
            Str('do'),
            Space(),
            Str('anything'),
            Space(),
            Str('to'),
            Space(),
            Str('her,'),
            Space(),
            Str('and'),
            Space(),
            Str('she'),
            Space(),
            Str('will'),
            Space(),
            Str('love'),
            Space(),
            Str('it'),
            Space(),
            Str('—'),
            Space(),
            Str('and'),
            Space(),
            Str('him.')]),
        'language': MetaInlines([Str('en')]),
        'published': MetaInlines([Str('2002-07-24')]),
        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
        'subject': MetaList([
            MetaInlines([Str('M/f')]),
            MetaInlines([Str('pregnant')]),
            MetaInlines([Str('spanking')]),
            MetaInlines([Str('D/s')]),
            MetaInlines([Str('real')]),
            MetaInlines([Str('reluctant')]),
            MetaInlines([Str('Serious')])]),
        'title': MetaMap({
            'file-as': MetaInlines([Str('Karen')]),
            'text': MetaInlines([Str('Karen')])})})
    # Wrap the top-level Meta; nested values are resolved by PyMetaMap.
    m = PyMetaMap(data)
    print(m)
--- a/markepub/frontmatter.py
+++ b/markepub/frontmatter.py
@@ -0,0 +1,215 @@
 import datetime
 import uuid
 from lxml import etree
 from lxml.builder import E
 import yaml
 from markepub.util import get_xhtml_template
 # DublinCore Elements:
 # - contributor
 # - coverage
 # + creator
 # + date
 # + description
 # - format
 # + identifier
 # + language
 # + publisher
 # - relation
 # - rights
 # + source
 # + subject
 # + title
 # - type
 # Calibre
 # + series
 # + series_index
 # XML namespace URIs for the Dublin Core elements and for OPF.
 DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
 OPF_NAMESPACE = "http://www.idpf.org/2007/opf"
 # The same namespaces wrapped in braces ("{uri}"), ready to be prefixed to a
 # local name when building tags or attribute names.
 DC = f'{{{DC_NAMESPACE}}}'
 OPF = f'{{{OPF_NAMESPACE}}}'
 class _Scalar:
    """Base class for a single-valued metadata item.

    Subclasses set ``CLASS`` (used as the ``class`` attribute of the
    title-page cell).  ``TAG`` is the tag of the generated XML element; it
    defaults to ``DC + CLASS`` and may be overridden explicitly.
    """
    CLASS = ''
    TAG = DC + CLASS

    def __init_subclass__(cls, **kwargs):
        """Derive ``TAG`` from ``CLASS`` for subclasses that set only ``CLASS``.

        ``TAG = DC + CLASS`` above is evaluated once, while ``CLASS`` is
        still empty.  Without this hook a subclass that overrides only
        ``CLASS`` would inherit the bare namespace as its tag.
        """
        super().__init_subclass__(**kwargs)
        own = cls.__dict__
        if 'CLASS' in own and 'TAG' not in own:
            cls.TAG = DC + cls.CLASS

    def __init__(self, value):
        # Surrounding whitespace is dropped from strings; other values
        # (None, numbers, dates) are stored untouched.
        self.value = value.strip() if isinstance(value, str) else value

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        # __repr__ must return a str even when the value is None or not a str.
        return f'{type(self).__name__}({self.value!r})'

    @property
    def _text(self):
        """The value as text for lxml, or None when there is no value."""
        return None if self.value is None else str(self.value)

    @property
    def element(self):
        """A new element tagged ``TAG`` whose text is the value."""
        e = etree.Element(self.TAG)
        # lxml only accepts str or None as element text.
        e.text = self._text
        return e

    @property
    def as_title_page(self):
        """A ``<tr><td class=CLASS>value</td></tr>`` row for the title page."""
        return E.tr(E.td(self._text or '', **{'class': self.CLASS}))
 class _HasFileAs(_Scalar):
    """Metadata item with an optional ``file-as`` (sort key) attribute."""

    def __init__(self, value: str, file_as: str | None = None):
        super().__init__(value=value)
        # file_as is optional, so only strip it when it is a string.
        # Calling .strip() unconditionally crashed on the default None.
        self.file_as = file_as.strip() if isinstance(file_as, str) else file_as

    @classmethod
    def from_yaml(cls, value: str | dict[str, str]):
        """Build an instance from a YAML scalar or mapping.

        A plain string becomes the value.  A mapping supplies the value
        under ``text`` (preferred) or ``value``, and optionally ``file-as``.

        Raises:
            KeyError: if a mapping has neither ``text`` nor ``value``.
        """
        if isinstance(value, str):
            return cls(value=value)
        # Look up 'value' only when 'text' is missing.  The earlier
        # ``value.get('text', value['value'])`` evaluated the fallback
        # eagerly and raised KeyError for mappings that only had 'text'.
        text = value['text'] if 'text' in value else value['value']
        return cls(value=text, file_as=value.get('file-as'))

    @property
    def element(self):
        """The base element, plus an ``opf:file-as`` attribute when set."""
        e = super().element
        if self.file_as:
            e.set(OPF + 'file-as', self.file_as)
        return e
 class Author(_HasFileAs):
    """A ``creator`` element that always carries the OPF role ``aut``."""
    TAG = DC + 'creator'
    CLASS = 'author'

    @property
    def element(self):
        creator = super().element
        creator.set(OPF + 'role', 'aut')
        return creator
 class Title(_HasFileAs):
    """Title item; its title-page cell uses the class ``title``."""
    CLASS = 'title'
 class _Date(_Scalar):
    """Base for items whose element is tagged ``date`` in the DC namespace."""
    TAG = DC + 'date'
 class Published(_Date):
    """Date item whose element is marked with the OPF event ``publication``."""

    @property
    def element(self):
        date_element = super().element
        date_element.set(OPF + 'event', 'publication')
        return date_element
 class Modified(_Date):
    """Date item stamped with the current UTC time when it is rendered."""

    def __init__(self):
        # No stored value: the timestamp is generated in ``element``.
        super().__init__(value=None)

    @property
    def element(self):
        """A date element for the OPF event ``modification``, timed "now"."""
        e = super().element
        # Emit an ISO 8601 / W3CDTF date-time in UTC ("...T..:..:..Z").
        # The former '%Y-%m-%d %H-%M-%S' layout (blank separator, hyphens
        # in the time) is not a valid date-time representation.
        now = datetime.datetime.now(datetime.UTC)
        e.text = now.strftime('%Y-%m-%dT%H:%M:%SZ')
        e.set(OPF + 'event', 'modification')
        return e
 class Description(_Scalar):
    """Description item; its title-page cell uses the class ``description``."""
    CLASS = 'description'
 class Identifier(_Scalar):
    """Identifier item; a random UUID URN is generated when none is given."""
    CLASS = 'identifier'

    def __init__(self, value: str = None):
        super().__init__(value=uuid.uuid4().urn if value is None else value)

    @property
    def element(self):
        node = super().element
        for name, text in (('id', 'BookId'), (OPF + 'scheme', 'UUID')):
            node.set(name, text)
        return node
 class Language(_Scalar):
    """Language item; its title-page cell uses the class ``language``."""
    CLASS = 'language'
 class Publisher(_Scalar):
    """Publisher item; its title-page cell uses the class ``publisher``."""
    CLASS = 'publisher'
 class Source(_Scalar):
    """Source item; its title-page cell uses the class ``source``."""
    CLASS = 'source'
 class Subject(_Scalar):
    """Subject (keyword) item; its title-page cell uses the class ``subject``."""
    # Lower-case like every sibling class and like the Dublin Core element
    # name; the previous 'Subject' was inconsistent with both.
    CLASS = 'subject'
 class _Calibre(_Scalar):
    """Metadata item written as ``<meta name="calibre:TAG" content="value"/>``."""

    @property
    def element(self):
        """The ``meta`` element for this item.

        Declared as a property because it overrides ``_Scalar.element``,
        which is a property; as a plain method, ``item.element`` would
        return a bound method for Calibre items and an element for the rest.
        """
        # XML attribute values must be strings (series indexes may be numbers).
        content = '' if self.value is None else str(self.value)
        return etree.Element('meta', name=f'calibre:{self.TAG}', content=content)
 class Series(_Calibre):
    """Calibre item named ``calibre:series``."""
    TAG = 'series'
 class Index(_Calibre):
    """Calibre item named ``calibre:series_index``.

    Derives from ``_Calibre`` (like ``Series``) so that it is written as a
    ``meta`` element.  As a plain ``_Scalar`` it would have produced a bare
    ``<series_index>`` element.
    """
    TAG = 'series_index'
 class Frontmatter:
    """Book metadata collected from keyword arguments.

    ``creator`` and ``subject`` keyword arguments are additionally exposed
    as the lists ``creators`` and ``subjects``.  Every keyword argument is
    also stored as an attribute under its own name.
    """
    creators: list[Author] = None
    description: Description = None
    identifier: Identifier = None
    index: Index = None
    language: Language = None
    published: Published = None
    publisher: Publisher = None
    series: Series = None
    source: Source = None
    subjects: list[Subject] = None
    title: Title = None

    def __init__(self, **kwargs):
        self.creators = self._as_list(kwargs['creator']) if 'creator' in kwargs else None
        self.subjects = self._as_list(kwargs['subject']) if 'subject' in kwargs else None
        self.__dict__.update(kwargs)

    @staticmethod
    def _as_list(value):
        """Wrap a single item in a list; copy lists and tuples into a list."""
        # A list passed as ``creator`` used to be wrapped again ([[...]]).
        return list(value) if isinstance(value, (list, tuple)) else [value]

    @property
    def as_title_page(self):
        """Build the XHTML title page as an lxml element tree.

        The page holds one table with a row each for the creators, the
        title, the series (with its index) and the description; rows whose
        data is missing are left out.
        """
        doc = get_xhtml_template()
        root = doc.getroot()
        # The E-factory accepts only strings and elements as children, so
        # metadata objects have to be converted with str() first.
        title_text = '' if self.title is None else str(self.title)
        root.append(
            E.head(
                E.title(title_text),
                E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
            )
        )
        table_body = E.tbody()
        root.append(E.body(E.table(table_body), **{'class': 'title-page'}))
        if self.creators:
            # One name per line, separated by <br/>.
            items = []
            for creator in self.creators:
                if items:
                    items.append(E.br())
                items.append(str(creator))
            # The class goes on the <td>, matching the rows produced by
            # _Scalar.as_title_page.
            table_body.append(E.tr(E.td(*items, **{'class': 'author'})))
        if self.title is not None:
            table_body.append(self.title.as_title_page)
        if self.series:
            items = [str(self.series)]
            if self.index:
                items.extend((E.br(), str(self.index)))
            table_body.append(E.tr(E.td(*items, **{'class': 'sub-title'})))
        if self.description:
            table_body.append(self.description.as_title_page)
        return doc
--- a/markepub/normalize-md.py
+++ b/markepub/normalize-md.py
@@ -0,0 +1,49 @@
 import re
 def normalize_markdown(text: list[str]):
    """Normalise the lines of a Markdown document and return it as one string.

    * A leading front-matter block delimited by ``---`` lines is kept
      verbatim.  If the closing ``---`` is missing, every remaining line is
      treated as front matter.
    * Consecutive non-empty lines are joined into one paragraph; runs of
      empty lines collapse into a single paragraph break.
    * Inside a paragraph, runs of spaces become one space and a line break
      is inserted after ``.``, ``!`` or ``?`` (optionally followed by ``”``)
      when whitespace follows.

    Args:
        text: the document's lines without trailing whitespace.  The list
            is not modified.

    Returns:
        The normalised document, or ``''`` for empty input.
    """
    spaces = re.compile(r' +')
    ends = re.compile(r'([!?.])\s')
    ends_quoted = re.compile(r'([!?.])”\s')

    # Find the front matter by index instead of popping from the front of
    # the caller's list, which was quadratic and destroyed the input.
    front_matter = []
    body_start = 0
    if text and text[0] == "---":
        try:
            closing = text.index("---", 1)
        except ValueError:
            closing = len(text)
        front_matter = text[1:closing]
        body_start = closing + 1

    # Group consecutive non-empty lines into paragraphs.
    main_matter = []
    buffer = []
    for line in text[body_start:]:
        if line:
            buffer.append(line)
        elif buffer:
            main_matter.append(' '.join(buffer))
            buffer = []
    if buffer:
        main_matter.append(' '.join(buffer))

    def tidy(paragraph):
        # Collapse spaces first so that one whitespace character follows
        # each sentence end, then turn that character into a line break.
        paragraph = spaces.sub(' ', paragraph)
        paragraph = ends.sub(r'\1\n', paragraph)
        return ends_quoted.sub(r'\1”\n', paragraph)

    main_matter = [tidy(paragraph) for paragraph in main_matter]
    result = '---\n' + '\n'.join(front_matter) + '\n---\n\n' if front_matter else ''
    return result + '\n\n'.join(main_matter)
 if __name__ == '__main__':
    # Normalise Karen.md in place: read its lines without trailing
    # whitespace, then overwrite the file with the normalised text.
    with open('Karen.md', encoding='utf-8') as source:
        stripped_lines = [raw.rstrip() for raw in source]
    normalized = normalize_markdown(stripped_lines)
    with open('Karen.md', 'w', encoding='utf-8') as target:
        target.write(normalized)
--- a/markepub/title_page.py
+++ b/markepub/title_page.py
@@ -0,0 +1,165 @@
 from lxml.builder import E
 from lxml import etree
 import pandoc
 from pandoc.types import *
 from util import get_xhtml_template
 def class_(*args):  # class is a reserved word in Python
    """Return an attribute dict whose ``class`` entry joins *args* with spaces."""
    joined = ' '.join(args)
    return {"class": joined}
 def _meta_text(value, **write_kwargs):
    """Render a meta value as stripped text, or return None.

    ``MetaInlines`` is rendered with ``pandoc.write`` (extra keyword
    arguments are passed through), ``MetaString`` yields its payload, and
    a ``MetaMap`` is represented by its ``text`` entry.  Any other value,
    or a map without ``text``, gives None.
    """
    if isinstance(value, MetaMap):
        entries = value[0]
        value = entries['text'] if 'text' in entries else None
    if isinstance(value, MetaString):
        return value[0].strip()
    if isinstance(value, MetaInlines):
        return pandoc.write(value, **write_kwargs).strip()
    return None


 def _author_names(creator):
    """Return the names of all creators whose role is absent or ``aut``."""
    # Either a scalar (used as the value) or a list of entries.
    entries = creator[0] if isinstance(creator, MetaList) else [creator]
    names = []
    for entry in entries:
        if isinstance(entry, MetaMap) and 'role' in entry[0]:
            # The role is a meta value, not a str, so it must be rendered
            # before it is compared.  Comparing the raw object with 'aut'
            # was never equal and skipped every creator that had a role.
            if _meta_text(entry[0]['role']) != 'aut':
                continue
        name = _meta_text(entry)
        if name:
            names.append(name)
    return names


 def make_title_page(pandoc_meta: Meta) -> etree.ElementTree:
    """Build an XHTML title page from pandoc document metadata.

    The ``creator``, ``title``, ``sub-title`` and ``description`` entries
    each become one table row (``<tr><td class="...">``); missing or empty
    entries are skipped.  With several authors, the names share one cell
    and are separated by ``<br/>``.  The ``<body>`` is only added when
    there is at least one row.

    Args:
        pandoc_meta: the ``Meta`` object of a pandoc document.

    Returns:
        The element tree from ``get_xhtml_template`` with ``<head>`` and,
        if applicable, ``<body>`` appended to its root.
    """
    meta_dict = pandoc_meta[0]
    items = []

    if 'creator' in meta_dict:
        names = _author_names(meta_dict['creator'])
        if names:
            cell = [names[0]]
            for name in names[1:]:
                cell.extend((E.br(), name))
            items.append(E.tr(E.td(*cell, class_('author'))))

    title = ''
    if 'title' in meta_dict:
        title = _meta_text(meta_dict['title']) or ''
        if title:
            items.append(E.tr(E.td(title, class_('title'))))

    if 'sub-title' in meta_dict:
        sub_title = _meta_text(meta_dict['sub-title'])
        if sub_title:
            items.append(E.tr(E.td(sub_title, class_('sub-title'))))

    if 'description' in meta_dict:
        # NOTE(review): the description is rendered to HTML but inserted as
        # text, so any markup in it is escaped on output.  Confirm whether
        # that is intended.
        description = _meta_text(meta_dict['description'],
                                 format='html5',
                                 options=["--wrap=none"])
        if description:
            items.append(E.tr(E.td(description, class_('description'))))

    doc = get_xhtml_template()
    root = doc.getroot()
    root.append(
        E.head(
            E.title(title),
            E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
        )
    )
    if items:
        root.append(
            E.body(
                E.table(
                    E.tbody(*items),
                )
            )
        )
    return doc
 def prettyprint(element, **kwargs):
    """Print *element* as pretty-printed XML without adding a newline.

    Extra keyword arguments are forwarded to ``etree.tostring``.
    """
    serialized = etree.tostring(element, pretty_print=True, **kwargs)
    print(serialized.decode(), end='')
 if __name__ == '__main__':
    # Manual smoke test: render the title page for a hand-built pandoc Meta
    # tree and print it as XML.
    data = Meta({
        'creator': MetaList([
            MetaMap({
                'file-as': MetaInlines([
                    Str('Riter,'),
                    Space(),
                    Str('E.Z.')]),
                'role': MetaInlines([
                    Str('aut')]),
                'text': MetaInlines([
                    Str('E.Z.'),
                    Space(),
                    Str('Riter')])})]),
        'description': MetaInlines([
            Str('Karen'),
            Space(),
            Str('meets'),
            Space(),
            Str('the'),
            Space(),
            Str('man'),
            Space(),
            Str('she'),
            Space(),
            Str('can’t'),
            Space(),
            Str('resist'),
            Space(),
            Str('—'),
            Space(),
            Str('the'),
            Space(),
            Str('man'),
            Space(),
            Str('who'),
            Space(),
            Str('can'),
            Space(),
            Str('do'),
            Space(),
            Str('anything'),
            Space(),
            Str('to'),
            Space(),
            Str('her,'),
            Space(),
            Str('and'),
            Space(),
            Str('she'),
            Space(),
            Str('will'),
            Space(),
            Str('love'),
            Space(),
            Str('it'),
            Space(),
            Str('—'),
            Space(),
            Str('and'),
            Space(),
            Str('him.')]),
        'language': MetaInlines([Str('en')]),
        'published': MetaInlines([Str('2002-07-24')]),
        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
        'subject': MetaList([
            MetaInlines([Str('M/f')]),
            MetaInlines([Str('pregnant')]),
            MetaInlines([Str('spanking')]),
            MetaInlines([Str('D/s')]),
            MetaInlines([Str('real')]),
            MetaInlines([Str('reluctant')]),
            MetaInlines([Str('Serious')])]),
        'title': MetaMap({
            'file-as': MetaInlines([Str('Karen')]),
            'text': MetaInlines([Str('Karen')])})})
    doc = make_title_page(data)
    # Serialise with an XML declaration; the bytes are decoded for print().
    print(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8').decode(), end='')
--- a/markepub/util.py
+++ b/markepub/util.py
@@ -0,0 +1,9 @@
 from lxml import etree
 def get_xhtml_template():
    """Return a new element tree whose root is an empty XHTML ``<html>`` element.

    The markup is parsed together with an XML declaration and the
    XHTML 1.1 DOCTYPE.
    """
    markup = (
        '<?xml version="1.0"?>'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        '<html xmlns="http://www.w3.org/1999/xhtml"></html>'
    )
    return etree.ElementTree(etree.XML(markup))
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
 [project]
 name = "markepub"
 version = "0.1.0"
 description = "Add your description here"
 requires-python = ">=3.13"
 dependencies = [
    "lxml>=6.0.1",
    "pandoc>=2.4",
    "pyyaml>=6.0.2",
 ]