wip

2025-09-21 19:43:16 +02:00
parent f6b9ff6a78
commit 80e30294db
9 changed files with 2340 additions and 0 deletions
--- a/Karen.md
+++ b/Karen.md
--- a/README.md
+++ b/README.md
@@ -1,2 +1,10 @@
 # markepub

+
+normalize-md
+
+In place, take a Markdown file and:
+
+- remove double empty lines
+- add a line break after every sentence-ending `.`, `!` or `?`
+
--- a/markepub/init.py
+++ b/markepub/init.py
--- a/markepub/de_meta.py
+++ b/markepub/de_meta.py
@@ -0,0 +1,156 @@
+from collections.abc import Mapping, Sequence
+
+import pandoc
+from lxml import etree
+from lxml.builder import E
+# noinspection PyUnresolvedReferences
+from pandoc.types import Meta, MetaMap, MetaList, MetaBool, MetaString, MetaInlines, MetaBlocks, Str, Emph, Underline, \
+    Strong, Strikeout, Superscript, Subscript, SmallCaps, Quoted, Cite, Code, Space, SoftBreak, LineBreak, Math
+
+from util import get_xhtml_template
+
+
+def resolve_inline(value):
+    """Flatten a simple pandoc inline into plain text.
+
+    ``Str`` yields its text, ``Space`` and ``SoftBreak`` yield a single space
+    and ``LineBreak`` yields a newline.  Every other value -- including the
+    formatting inlines (``Emph``, ``Underline``, ``Strong``, ``Strikeout``,
+    ``Superscript``, ``Subscript``, ``SmallCaps``, ``Quoted``, ``Cite``,
+    ``Code``, ``Math``), which are not flattened yet -- is returned unchanged.
+    """
+    if isinstance(value, Str):
+        # The text is stored as the first field of the node.
+        return value[0]
+    if isinstance(value, (Space, SoftBreak)):
+        return ' '
+    if isinstance(value, LineBreak):
+        return '\n'
+    return value
+
+def resolve_meta_value(value):
+    """Convert a pandoc meta value into its Python-side counterpart.
+
+    ``MetaBool`` and ``MetaString`` are unwrapped to the value they carry;
+    ``MetaMap``, ``MetaList``, ``MetaInlines`` and ``MetaBlocks`` are wrapped
+    in the matching ``PyMeta*`` class.  Anything else is returned unchanged.
+    """
+    if isinstance(value, (MetaBool, MetaString)):
+        return value[0]
+    wrappers = (
+        (MetaMap, PyMetaMap),
+        (MetaList, PyMetaList),
+        (MetaInlines, PyMetaInlines),
+        (MetaBlocks, PyMetaBlocks),
+    )
+    for pandoc_type, wrapper in wrappers:
+        if isinstance(value, pandoc_type):
+            return wrapper(value)
+    return value
+
+class _Sequence(Sequence):
+    """Read-only sequence over the items stored in the first field of ``p``."""
+
+    def __init__(self, p) -> None:
+        # Snapshot the wrapped items so later indexing works on a plain list.
+        self._data = list(p[0])
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __getitem__(self, index: int):
+        return self._data[index]
+
+class PyMetaBlocks(_Sequence):
+    """Sequence view of the items held by a ``MetaBlocks`` value."""
+
+
+class PyMetaInlines(_Sequence):
+    """Sequence view of the items held by a ``MetaInlines`` value."""
+
+
+# noinspection PyMissingConstructor
+class PyMetaList(_Sequence):
+    """Sequence view of a ``MetaList`` whose items are resolved recursively."""
+
+    def __init__(self, p) -> None:
+        # The base initializer is bypassed on purpose: it would store the raw
+        # items, whereas each one must go through resolve_meta_value first.
+        self._data = list(map(resolve_meta_value, p[0]))
+
+
+class PyMetaMap(Mapping):
+    """Read-only mapping view of a pandoc ``Meta`` or ``MetaMap`` value.
+
+    The wrapped object's first field is expected to be a dict; every value in
+    it is converted with ``resolve_meta_value`` when the view is built.
+    """
+
+    def __init__(self, pandoc_meta):
+        # Read from the argument: no ``_pandoc`` attribute is ever set on the
+        # instance, so going through ``self`` raised AttributeError.
+        self._data = {k: resolve_meta_value(v) for k, v in pandoc_meta[0].items()}
+
+    def __getitem__(self, key: str, /):
+        return self._data[key]
+
+    def __len__(self) -> int:
+        return len(self._data)
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __repr__(self) -> str:
+        return f'{type(self).__name__}({self._data!r})'
+
+
+if __name__ == '__main__':
+    # Manual smoke test: build a hand-written pandoc Meta fixture, wrap it in
+    # PyMetaMap and print the wrapper.
+    data = Meta({
+        'creator': MetaList([
+            MetaMap({
+                'file-as': MetaInlines([
+                    Str('Riter,'),
+                    Space(),
+                    Str('E.Z.')]),
+                'role': MetaInlines([
+                    Str('aut')]),
+                'text': MetaInlines([
+                    Str('E.Z.'),
+                    Space(),
+                    Str('Riter')])})]),
+        'description': MetaInlines([
+            Str('Karen'),
+            Space(),
+            Str('meets'),
+            Space(),
+            Str('the'),
+            Space(),
+            Str('man'),
+            Space(),
+            Str('she'),
+            Space(),
+            Str('can’t'),
+            Space(),
+            Str('resist'),
+            Space(),
+            Str('—'),
+            Space(),
+            Str('the'),
+            Space(),
+            Str('man'),
+            Space(),
+            Str('who'),
+            Space(),
+            Str('can'),
+            Space(),
+            Str('do'),
+            Space(),
+            Str('anything'),
+            Space(),
+            Str('to'),
+            Space(),
+            Str('her,'),
+            Space(),
+            Str('and'),
+            Space(),
+            Str('she'),
+            Space(),
+            Str('will'),
+            Space(),
+            Str('love'),
+            Space(),
+            Str('it'),
+            Space(),
+            Str('—'),
+            Space(),
+            Str('and'),
+            Space(),
+            Str('him.')]),
+        'language': MetaInlines([Str('en')]),
+        'published': MetaInlines([Str('2002-07-24')]),
+        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
+        'subject': MetaList([
+            MetaInlines([Str('M/f')]),
+            MetaInlines([Str('pregnant')]),
+            MetaInlines([Str('spanking')]),
+            MetaInlines([Str('D/s')]),
+            MetaInlines([Str('real')]),
+            MetaInlines([Str('reluctant')]),
+            MetaInlines([Str('Serious')])]),
+        'title': MetaMap({
+            'file-as': MetaInlines([Str('Karen')]),
+            'text': MetaInlines([Str('Karen')])})})
+
+    # Wrap the top-level Meta object in the mapping view defined above.
+    m = PyMetaMap(data)
+
+    print(m)
--- a/markepub/frontmatter.py
+++ b/markepub/frontmatter.py
@@ -0,0 +1,215 @@
+import datetime
+import uuid
+from lxml import etree
+from lxml.builder import E
+
+import yaml
+
+from markepub.util import get_xhtml_template
+
+# DublinCore Elements:
+# - contributor
+# - coverage
+# + creator
+# + date
+# + description
+# - format
+# + identifier
+# + language
+# + publisher
+# - relation
+# - rights
+# + source
+# + subject
+# + title
+# - type
+
+# Calibre
+# + series
+# + series_index
+
+
+DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
+OPF_NAMESPACE = "http://www.idpf.org/2007/opf"
+DC = f'{{{DC_NAMESPACE}}}'
+OPF = f'{{{OPF_NAMESPACE}}}'
+
+
+
+
+class _Scalar:
+    """Base class for a single metadata value.
+
+    ``CLASS`` is the CSS class used on the title page; ``TAG`` is the element
+    name used by ``element`` (by default the Dublin Core namespace plus
+    ``CLASS``).  String values are stripped of surrounding whitespace.
+    """
+    CLASS = ''
+    TAG = DC + CLASS
+
+    def __init_subclass__(cls, **kwargs):
+        super().__init_subclass__(**kwargs)
+        # ``TAG`` above is computed once, while CLASS is still ''.  A subclass
+        # that only overrides CLASS would therefore inherit a tag with an
+        # empty local name, so derive it here unless a TAG was set explicitly.
+        if 'CLASS' in cls.__dict__ and 'TAG' not in cls.__dict__ and cls.TAG == DC:
+            cls.TAG = DC + cls.CLASS
+
+    def __init__(self, value):
+        self.value = value.strip() if isinstance(value, str) else value
+
+    def __str__(self):
+        return str(self.value)
+
+    def __repr__(self):
+        # __repr__ must return a str; the raw value may be None or non-string.
+        return f'{type(self).__name__}({self.value!r})'
+
+    def _text(self):
+        """Return the value as text, or None when there is no value."""
+        return None if self.value is None else str(self.value)
+
+    @property
+    def element(self):
+        """Build the metadata element named ``TAG`` holding the value as text."""
+        e = etree.Element(self.TAG)
+        # lxml only accepts str or None as element text.
+        e.text = self._text()
+        return e
+
+    @property
+    def as_title_page(self):
+        """Build a table row with one cell carrying the value and ``CLASS``."""
+        return E.tr(E.td(self._text() or '', **{'class': self.CLASS}))
+
+
+class _HasFileAs(_Scalar):
+    """Scalar metadata value with an optional ``file-as`` (sort key) string."""
+
+    def __init__(self, value: str, file_as: str | None = None):
+        super().__init__(value=value)
+        # file_as is optional; calling .strip() on the default None crashed.
+        self.file_as = file_as.strip() if isinstance(file_as, str) else file_as
+
+    @classmethod
+    def from_yaml(cls, value: str | dict[str, str]):
+        """Build an instance from a YAML scalar or mapping.
+
+        A plain string is used as the value.  A mapping supplies the value
+        under ``text`` (or, failing that, ``value``) and may carry ``file-as``.
+
+        Raises:
+            KeyError: if a mapping has neither ``text`` nor ``value``.
+        """
+        if isinstance(value, str):
+            return cls(value=value)
+        # Look up the fallback key only when 'text' is missing; passing
+        # value['value'] as a .get() default evaluated it unconditionally.
+        text = value['text'] if 'text' in value else value['value']
+        return cls(value=text, file_as=value.get('file-as'))
+
+    @property
+    def element(self):
+        """Build the base element and add ``opf:file-as`` when one is set."""
+        e = super().element
+        if self.file_as:
+            e.set(OPF + 'file-as', self.file_as)
+        return e
+
+
+class Author(_HasFileAs):
+    """Book author, written as a Dublin Core ``creator`` with role ``aut``."""
+    CLASS = 'author'
+    TAG = DC + 'creator'
+
+    @property
+    def element(self):
+        """Build the creator element and mark it with ``opf:role="aut"``."""
+        creator = super().element
+        creator.set(OPF + 'role', 'aut')
+        return creator
+
+class Title(_HasFileAs):
+    """Book title; rendered on the title page with the CSS class ``title``."""
+    CLASS = 'title'
+
+
+class _Date(_Scalar):
+    """Base for date values; the element name is ``date`` in the DC namespace."""
+    TAG = DC + 'date'
+
+
+class Published(_Date):
+    """Publication date of the book."""
+
+    @property
+    def element(self):
+        """Build the date element and tag it with ``opf:event="publication"``."""
+        date = super().element
+        date.set(OPF + 'event', 'publication')
+        return date
+
+class Modified(_Date):
+    """Modification date; the timestamp is taken when ``element`` is built."""
+
+    def __init__(self):
+        # No stored value: the text is generated on every ``element`` access.
+        super().__init__(value=None)
+
+    @property
+    def element(self):
+        """Build the date element with the current UTC time and
+        ``opf:event="modification"``."""
+        e = super().element
+        now = datetime.datetime.now(datetime.UTC)
+        # OPF dates use the W3C date-time profile (YYYY-MM-DDThh:mm:ssZ); the
+        # previous space/hyphen separated form is not a valid date value.
+        e.text = now.strftime('%Y-%m-%dT%H:%M:%SZ')
+        e.set(OPF + 'event', 'modification')
+        return e
+
+class Description(_Scalar):
+    """Book description; rendered on the title page with the CSS class ``description``."""
+    CLASS = 'description'
+
+
+class Identifier(_Scalar):
+    """Book identifier; a random UUID URN is generated when none is given."""
+    CLASS = 'identifier'
+
+    def __init__(self, value: str = None):
+        super().__init__(value=uuid.uuid4().urn if value is None else value)
+
+    @property
+    def element(self):
+        """Build the element with ``id="BookId"`` and ``opf:scheme="UUID"``."""
+        identifier = super().element
+        identifier.set('id', 'BookId')
+        identifier.set(OPF + 'scheme', 'UUID')
+        return identifier
+
+class Language(_Scalar):
+    """Language of the book; uses the CSS class ``language``."""
+    CLASS = 'language'
+
+class Publisher(_Scalar):
+    """Publisher of the book; uses the CSS class ``publisher``."""
+    CLASS = 'publisher'
+
+class Source(_Scalar):
+    """Source the book was taken from; uses the CSS class ``source``."""
+    CLASS = 'source'
+
+class Subject(_Scalar):
+    """A subject (keyword) of the book, written as a Dublin Core ``subject``."""
+    # Lowercase, like every other CLASS in this module.
+    CLASS = 'subject'
+    # Set explicitly so the element name does not depend on how the base
+    # class derives TAG from CLASS.
+    TAG = DC + 'subject'
+
+class _Calibre(_Scalar):
+    """Base for Calibre-specific values, written as
+    ``<meta name="calibre:TAG" content="VALUE"/>``."""
+
+    @property
+    def element(self):
+        """Build the ``meta`` element for this value."""
+        # A property, like ``_Scalar.element``, so every metadata class is
+        # accessed the same way.  Attribute values must be strings for lxml.
+        return etree.Element('meta', name=f'calibre:{self.TAG}', content=str(self.value))
+
+class Series(_Calibre):
+    """Name of the series the book belongs to (Calibre ``series``)."""
+    TAG = 'series'
+
+class Index(_Calibre):
+    """Position of the book within its series (Calibre ``series_index``)."""
+    # Derives from _Calibre, like Series: with _Scalar as the base, ``element``
+    # produced a bare <series_index> element instead of a calibre meta entry.
+    TAG = 'series_index'
+
+
+
+class Frontmatter:
+    """Collection of a book's metadata values.
+
+    Keyword arguments are stored as attributes under their own names.  In
+    addition, ``creator`` and ``subject`` populate the ``creators`` and
+    ``subjects`` lists.
+    """
+    creators: list[Author] = None
+    description: Description = None
+    identifier: Identifier = None
+    index: Index = None
+    language: Language = None
+    published: Published = None
+    publisher: Publisher = None
+    series: Series = None
+    source: Source = None
+    subjects: list[Subject] = None
+    title: Title = None
+
+    def __init__(self, **kwargs):
+        self.creators = self._as_list(kwargs['creator']) if 'creator' in kwargs else None
+        self.subjects = self._as_list(kwargs['subject']) if 'subject' in kwargs else None
+        self.__dict__.update(kwargs)
+
+    @staticmethod
+    def _as_list(value):
+        """Return ``value`` as a list, wrapping a single item."""
+        # A list/tuple is copied as-is instead of being nested in another list.
+        return list(value) if isinstance(value, (list, tuple)) else [value]
+
+    @property
+    def as_title_page(self):
+        """Build the XHTML title page document.
+
+        The page holds one table with, in order, the authors, the title, the
+        series (with index) and the description; rows whose value is missing
+        are left out.
+        """
+        doc = get_xhtml_template()
+        root = doc.getroot()
+        # The E-factory only accepts strings, dicts and elements as children,
+        # so metadata objects are converted to text before being passed in.
+        title_text = '' if self.title is None else str(self.title)
+        root.append(
+            E.head(
+                E.title(title_text),
+                E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
+            )
+        )
+        table_body = E.tbody()
+        root.append(E.body(E.table(table_body), **{'class': 'title-page'}))
+
+        if self.creators:
+            items = []
+            for position, creator in enumerate(self.creators):
+                if position:
+                    # Separate consecutive authors with a line break.
+                    items.append(E.br())
+                items.append(str(creator))
+            table_body.append(E.tr(E.td(*items), **{'class': 'author'}))
+
+        if self.title is not None:
+            table_body.append(self.title.as_title_page)
+
+        if self.series:
+            items = [str(self.series)]
+            if self.index:
+                items.extend((E.br(), str(self.index)))
+            table_body.append(E.tr(E.td(*items), **{'class': 'sub-title'}))
+
+        if self.description:
+            table_body.append(self.description.as_title_page)
+
+        return doc
+
--- a/markepub/normalize-md.py
+++ b/markepub/normalize-md.py
@@ -0,0 +1,49 @@
+import re
+
+
+def normalize_markdown(text: list[str]):
+    """Normalize the lines of a Markdown document and return the new text.
+
+    A leading front-matter block (delimited by ``---`` lines) is passed
+    through verbatim.  In the rest of the document, consecutive non-empty
+    lines are joined into one paragraph, runs of spaces are collapsed, and a
+    line break is placed after every ``.``, ``!`` or ``?`` (optionally
+    followed by ``”``) that is followed by whitespace.  Paragraphs are
+    separated by exactly one empty line.
+
+    Args:
+        text: the document's lines without trailing newlines.  The list is
+            not modified.
+
+    Returns:
+        The normalized document as a single string ('' for empty input).
+    """
+    front_matter = []
+    body_start = 0
+
+    # Guard against empty input before looking at the first line.
+    if text and text[0] == "---":
+        # Without a closing marker everything counts as front matter.
+        body_start = len(text)
+        for position in range(1, len(text)):
+            if text[position] == "---":
+                body_start = position + 1
+                break
+            front_matter.append(text[position])
+
+    main_matter = []
+    buffer = []
+
+    # Iterate instead of pop(0): linear time, and the caller's list survives.
+    for line in text[body_start:]:
+        if line:
+            buffer.append(line)
+        elif buffer:
+            main_matter.append(' '.join(buffer))
+            buffer = []
+
+    if buffer:
+        main_matter.append(' '.join(buffer))
+
+    spaces = re.compile(r' +')
+    ends = re.compile(r'([!?.])\s')
+    ends_quoted = re.compile(r'([!?.])”\s')
+
+    normalized = []
+    for paragraph in main_matter:
+        paragraph = spaces.sub(' ', paragraph)
+        paragraph = ends.sub(r'\1\n', paragraph)
+        paragraph = ends_quoted.sub(r'\1”\n', paragraph)
+        normalized.append(paragraph)
+
+    result = '---\n' + '\n'.join(front_matter) + '\n---\n\n' if front_matter else ''
+    return result + '\n\n'.join(normalized)
+
+
+if __name__ == '__main__':
+    import sys
+
+    # The file to normalize may be given as the first command-line argument;
+    # without one the previous default, 'Karen.md', is used.
+    path = sys.argv[1] if len(sys.argv) > 1 else 'Karen.md'
+
+    with open(path, encoding='utf-8') as f:
+        result = normalize_markdown([line.rstrip() for line in f])
+
+    # The file is rewritten in place with the normalized text.
+    with open(path, 'w', encoding='utf-8') as f:
+        f.write(result)
--- a/markepub/title_page.py
+++ b/markepub/title_page.py
@@ -0,0 +1,165 @@
+from lxml.builder import E
+from lxml import etree
+import pandoc
+from pandoc.types import *
+from util import get_xhtml_template
+
+
+def class_(*args):  # class is a reserved word in Python
+    """Return an attribute dict whose ``class`` entry is ``args`` joined by spaces."""
+    joined = ' '.join(args)
+    return {"class": joined}
+
+
+def _meta_text(value, **write_kwargs):
+    """Render a metadata value to stripped text, or return None if it has none.
+
+    A ``MetaMap`` is first reduced to its ``'text'`` entry.  ``MetaInlines``
+    are rendered with ``pandoc.write`` (extra keyword arguments are passed
+    through); a ``MetaString`` contributes its raw string.
+    """
+    if isinstance(value, MetaMap):
+        value = value[0].get('text')
+    if isinstance(value, MetaInlines):
+        return pandoc.write(value, **write_kwargs).strip()
+    if isinstance(value, MetaString):
+        return value[0].strip()
+    return None
+
+
+def make_title_page(pandoc_meta: Meta) -> etree.ElementTree:
+    """Build the XHTML title page from a document's pandoc metadata.
+
+    The page contains one table row each for the authors, the title, the
+    sub-title and the description; a row is left out when its value is
+    missing or empty.
+
+    Args:
+        pandoc_meta: the document's ``Meta`` object.
+
+    Returns:
+        The title page as an element tree based on ``get_xhtml_template()``.
+    """
+    meta_dict = pandoc_meta[0]
+    items = []
+
+    # 'creator' is either a scalar (used as the author name) or a list of
+    # entries, each one a scalar or a map with 'text' and an optional 'role'.
+    creator = meta_dict.get('creator')
+    entries = creator[0] if isinstance(creator, MetaList) else [creator]
+    authors = []
+    for entry in entries:
+        if isinstance(entry, MetaMap):
+            # The role is itself a meta value, so it has to be rendered to
+            # text before it can be compared with 'aut'; comparing the raw
+            # node never matched and dropped every creator that had a role.
+            role = _meta_text(entry[0].get('role'))
+            if role is not None and role != 'aut':
+                continue
+        name = _meta_text(entry)
+        if name:
+            authors.append(name)
+    if authors:
+        # Show every author, separated by line breaks, not only the last one.
+        cell = [authors[0]]
+        for name in authors[1:]:
+            cell.extend((E.br(), name))
+        items.append(E.tr(E.td(*cell, class_('author'))))
+
+    title = _meta_text(meta_dict.get('title')) or ''
+    if title:
+        items.append(E.tr(E.td(title, class_('title'))))
+
+    sub_title = _meta_text(meta_dict.get('sub-title'))
+    if sub_title:
+        items.append(E.tr(E.td(sub_title, class_('sub-title'))))
+
+    # NOTE(review): the description is rendered as HTML but inserted as text,
+    # so any markup in it would presumably show up escaped -- confirm whether
+    # it should be parsed into elements instead.
+    description = _meta_text(meta_dict.get('description'),
+                             format='html5',
+                             options=["--wrap=none"])
+    if description:
+        items.append(E.tr(E.td(description, class_('description'))))
+
+    doc = get_xhtml_template()
+    root = doc.getroot()
+    root.append(
+        E.head(
+            E.title(title),
+            E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
+        )
+    )
+    if items:
+        root.append(
+            E.body(
+                E.table(
+                    E.tbody(*items),
+                )
+            )
+        )
+
+    return doc
+
+
+def prettyprint(element, **kwargs):
+    """Print ``element`` as pretty-printed XML.
+
+    Extra keyword arguments are forwarded to ``etree.tostring``.
+    """
+    serialized = etree.tostring(element, pretty_print=True, **kwargs).decode()
+    print(serialized, end='')
+
+if __name__ == '__main__':
+    # Manual smoke test: build a hand-written pandoc Meta fixture, render the
+    # title page from it and print the resulting XML.
+    data = Meta({
+        'creator': MetaList([
+            MetaMap({
+                'file-as': MetaInlines([
+                    Str('Riter,'),
+                    Space(),
+                    Str('E.Z.')]),
+                'role': MetaInlines([
+                    Str('aut')]),
+                'text': MetaInlines([
+                    Str('E.Z.'),
+                    Space(),
+                    Str('Riter')])})]),
+        'description': MetaInlines([
+            Str('Karen'),
+            Space(),
+            Str('meets'),
+            Space(),
+            Str('the'),
+            Space(),
+            Str('man'),
+            Space(),
+            Str('she'),
+            Space(),
+            Str('can’t'),
+            Space(),
+            Str('resist'),
+            Space(),
+            Str('—'),
+            Space(),
+            Str('the'),
+            Space(),
+            Str('man'),
+            Space(),
+            Str('who'),
+            Space(),
+            Str('can'),
+            Space(),
+            Str('do'),
+            Space(),
+            Str('anything'),
+            Space(),
+            Str('to'),
+            Space(),
+            Str('her,'),
+            Space(),
+            Str('and'),
+            Space(),
+            Str('she'),
+            Space(),
+            Str('will'),
+            Space(),
+            Str('love'),
+            Space(),
+            Str('it'),
+            Space(),
+            Str('—'),
+            Space(),
+            Str('and'),
+            Space(),
+            Str('him.')]),
+        'language': MetaInlines([Str('en')]),
+        'published': MetaInlines([Str('2002-07-24')]),
+        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
+        'subject': MetaList([
+            MetaInlines([Str('M/f')]),
+            MetaInlines([Str('pregnant')]),
+            MetaInlines([Str('spanking')]),
+            MetaInlines([Str('D/s')]),
+            MetaInlines([Str('real')]),
+            MetaInlines([Str('reluctant')]),
+            MetaInlines([Str('Serious')])]),
+        'title': MetaMap({
+            'file-as': MetaInlines([Str('Karen')]),
+            'text': MetaInlines([Str('Karen')])})})
+
+    doc = make_title_page(data)
+
+    # Serialize with an XML declaration as UTF-8 bytes, then decode for printing.
+    print(etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8').decode(), end='')
--- a/markepub/util.py
+++ b/markepub/util.py
@@ -0,0 +1,9 @@
+from lxml import etree
+
+def get_xhtml_template():
+    """Return a new element tree holding an empty XHTML 1.1 document.
+
+    The root is an ``<html>`` element in the XHTML namespace with no
+    children; callers append ``head`` and ``body`` themselves.
+    """
+    skeleton = ''.join((
+        '<?xml version="1.0"?>',
+        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"',
+        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">',
+        '<html xmlns="http://www.w3.org/1999/xhtml"></html>',
+    ))
+    return etree.ElementTree(etree.XML(skeleton))
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,10 @@
+[project]
+name = "markepub"
+version = "0.1.0"
+description = "Add your description here"
+requires-python = ">=3.13"
+dependencies = [
+    "lxml>=6.0.1",
+    "pandoc>=2.4",
+    "pyyaml>=6.0.2",
+]