This commit is contained in:
Bernhard Radermacher
2025-09-21 19:43:16 +02:00
parent f6b9ff6a78
commit 80e30294db
9 changed files with 2340 additions and 0 deletions

1728
Karen.md Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,2 +1,10 @@
# markepub # markepub
normalize-md
Rewrites a Markdown file in place:
- collapse consecutive empty lines into a single empty line
- add a line break after every sentence end (`.`, `!`, `?`)

0
markepub/__init__.py Normal file
View File

156
markepub/de_meta.py Normal file
View File

@@ -0,0 +1,156 @@
from collections.abc import Mapping, Sequence
import pandoc
from lxml import etree
from lxml.builder import E
# noinspection PyUnresolvedReferences
from pandoc.types import Meta, MetaMap, MetaList, MetaBool, MetaString, MetaInlines, MetaBlocks, Str, Emph, Underline, \
Strong, Strikeout, Superscript, Subscript, SmallCaps, Quoted, Cite, Code, Space, SoftBreak, LineBreak, Math
from util import get_xhtml_template
def resolve_inline(value):
    """Translate a pandoc inline element into plain text where a mapping exists.

    ``Str`` yields its first field, ``Space`` and ``SoftBreak`` a single blank
    and ``LineBreak`` a newline.  Every other value is returned unchanged.
    """
    if isinstance(value, Str):
        return value[0]
    if isinstance(value, (Space, SoftBreak)):
        return ' '
    if isinstance(value, LineBreak):
        return '\n'
    # No conversion yet for Emph, Underline, Strong, Strikeout, Superscript,
    # Subscript, SmallCaps, Quoted, Cite, Code and Math: they fall through.
    return value
def resolve_meta_value(value):
    """Wrap or unwrap a pandoc meta value into its Python-side counterpart.

    ``MetaBool`` and ``MetaString`` are unwrapped to their first field; map,
    list, inline and block values are wrapped in the matching ``PyMeta*``
    class.  Any other value is returned as given.
    """
    if isinstance(value, (MetaBool, MetaString)):
        return value[0]
    wrappers = (
        (MetaMap, PyMetaMap),
        (MetaList, PyMetaList),
        (MetaInlines, PyMetaInlines),
        (MetaBlocks, PyMetaBlocks),
    )
    for pandoc_type, wrapper in wrappers:
        if isinstance(value, pandoc_type):
            return wrapper(value)
    return value
class _Sequence(Sequence):
def __getitem__(self, index: int):
return self._data[index]
def __len__(self) -> int:
return len(self._data)
def __init__(self, p) -> None:
self._data = [v for v in p[0]]
class PyMetaBlocks(_Sequence):
    """Sequence wrapper that ``resolve_meta_value`` uses for ``MetaBlocks`` values."""
class PyMetaInlines(_Sequence):
    """Sequence wrapper that ``resolve_meta_value`` uses for ``MetaInlines`` values."""
# noinspection PyMissingConstructor
class PyMetaList(_Sequence):
    """Sequence over the entries of ``p[0]``, each passed through ``resolve_meta_value``."""

    def __init__(self, p) -> None:
        # The base constructor is skipped on purpose: it would store the raw
        # entries, whereas this class stores the resolved ones.
        self._data = list(map(resolve_meta_value, p[0]))
class PyMetaMap(Mapping):
    """Read-only mapping over the dict at ``pandoc_meta[0]`` with resolved values.

    Each value is converted with ``resolve_meta_value`` once, at construction
    time; keys are kept as they are.
    """

    def __init__(self, pandoc_meta):
        # Read from the constructor argument: the previous code looked up
        # ``self._pandoc``, an attribute that was never assigned, so every
        # construction failed with AttributeError.
        self._data = {
            key: resolve_meta_value(value)
            for key, value in pandoc_meta[0].items()
        }

    def __getitem__(self, key: str, /):
        return self._data[key]

    def __iter__(self):
        return iter(self._data)

    def __len__(self) -> int:
        return len(self._data)

    def __repr__(self) -> str:
        # Show the resolved content instead of the default object address.
        return f'{type(self).__name__}({self._data!r})'
if __name__ == '__main__':
    # Hand-written sample metadata tree used to exercise PyMetaMap.
    data = Meta({
        'creator': MetaList([
            MetaMap({
                'file-as': MetaInlines([Str('Riter,'), Space(), Str('E.Z.')]),
                'role': MetaInlines([Str('aut')]),
                'text': MetaInlines([Str('E.Z.'), Space(), Str('Riter')]),
            }),
        ]),
        'description': MetaInlines([
            Str('Karen'), Space(), Str('meets'), Space(), Str('the'), Space(),
            Str('man'), Space(), Str('she'), Space(), Str('cant'), Space(),
            Str('resist'), Space(), Str(''), Space(), Str('the'), Space(),
            Str('man'), Space(), Str('who'), Space(), Str('can'), Space(),
            Str('do'), Space(), Str('anything'), Space(), Str('to'), Space(),
            Str('her,'), Space(), Str('and'), Space(), Str('she'), Space(),
            Str('will'), Space(), Str('love'), Space(), Str('it'), Space(),
            Str(''), Space(), Str('and'), Space(), Str('him.'),
        ]),
        'language': MetaInlines([Str('en')]),
        'published': MetaInlines([Str('2002-07-24')]),
        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
        'subject': MetaList([
            MetaInlines([Str('M/f')]),
            MetaInlines([Str('pregnant')]),
            MetaInlines([Str('spanking')]),
            MetaInlines([Str('D/s')]),
            MetaInlines([Str('real')]),
            MetaInlines([Str('reluctant')]),
            MetaInlines([Str('Serious')]),
        ]),
        'title': MetaMap({
            'file-as': MetaInlines([Str('Karen')]),
            'text': MetaInlines([Str('Karen')]),
        }),
    })
    m = PyMetaMap(data)
    print(m)

215
markepub/frontmatter.py Normal file
View File

@@ -0,0 +1,215 @@
import datetime
import uuid
from lxml import etree
from lxml.builder import E
import yaml
from markepub.util import get_xhtml_template
# DublinCore Elements:
# - contributor
# - coverage
# + creator
# + date
# + description
# - format
# + identifier
# + language
# + publisher
# - relation
# - rights
# + source
# + subject
# + title
# - type
# Calibre
# + series
# + series_index
# XML namespace URIs for the Dublin Core and OPF vocabularies.
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
OPF_NAMESPACE = "http://www.idpf.org/2007/opf"
# "{namespace}" prefixes, ready to be concatenated with a local tag or
# attribute name.
DC = '{' + DC_NAMESPACE + '}'
OPF = '{' + OPF_NAMESPACE + '}'
class _Scalar:
    """Base class for one metadata value that renders as a single XML element.

    ``CLASS`` is the CSS class used on the title page and ``TAG`` the
    qualified element name used by ``element``.
    """
    CLASS = ''
    TAG = DC + CLASS

    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        # ``TAG = DC + CLASS`` above runs once, for this base class only.  A
        # subclass that sets just CLASS would otherwise inherit the empty tag
        # name, so derive its TAG here.  Lower-cased because Dublin Core
        # element names are lower case.
        own = vars(cls)
        if 'CLASS' in own and 'TAG' not in own:
            cls.TAG = DC + cls.CLASS.lower()

    def __init__(self, value):
        # Surrounding whitespace is dropped from strings; other values are
        # stored untouched.
        self.value = value.strip() if isinstance(value, str) else value

    def __str__(self):
        return str(self.value)

    def __repr__(self):
        # __repr__ must return a str; returning the raw value raised
        # TypeError for non-string values such as None.
        return f'{type(self).__name__}({self.value!r})'

    @property
    def element(self):
        """Return a new element named ``TAG`` whose text is the value."""
        e = etree.Element(self.TAG)
        if self.value is not None:
            # Element text must be a string; values may be numbers or dates.
            e.text = str(self.value)
        return e

    @property
    def as_title_page(self):
        """Return a ``<tr><td class=CLASS>`` row containing the value as text."""
        text = '' if self.value is None else str(self.value)
        return E.tr(E.td(text, **{'class': self.CLASS}))
class _HasFileAs(_Scalar):
    """A scalar value with an optional ``file-as`` (sort key) attribute."""

    def __init__(self, value: str, file_as: str | None = None):
        super().__init__(value=value)
        # file_as defaults to None, so only strip when a string was given.
        self.file_as = file_as.strip() if isinstance(file_as, str) else file_as

    @classmethod
    def from_yaml(cls, value: str | dict[str, str]):
        """Build an instance from a plain string or from a mapping.

        A mapping supplies the text under ``text`` (preferred) or ``value``
        and may carry ``file-as``.

        Raises:
            KeyError: if a mapping has neither ``text`` nor ``value``.
        """
        if isinstance(value, str):
            return cls(value=value)
        # Look up 'value' only when 'text' is absent.  The previous
        # ``value.get('text', value['value'])`` evaluated the fallback first
        # and raised KeyError for mappings that only had 'text'.
        text = value['text'] if 'text' in value else value['value']
        return cls(value=text, file_as=value.get('file-as'))

    @property
    def element(self):
        """Return the base element, with ``opf:file-as`` set when available."""
        e = super().element
        if self.file_as:
            e.set(OPF + 'file-as', self.file_as)
        return e
class Author(_HasFileAs):
    """Creator entry rendered as ``creator`` with the ``aut`` role attribute."""
    CLASS = 'author'
    TAG = DC + 'creator'

    @property
    def element(self):
        """Return the inherited element with ``opf:role`` set to ``aut``."""
        creator = super().element
        creator.set(OPF + 'role', 'aut')
        return creator
class Title(_HasFileAs):
    """Book title, with optional ``file-as`` sort key."""
    CLASS = 'title'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'title'
class _Date(_Scalar):
    """Common base of the date-valued entries; all of them render as ``date``."""
    TAG = DC + 'date'
class Published(_Date):
    """Date entry flagged with the ``publication`` event."""

    @property
    def element(self):
        """Return the date element with ``opf:event`` set to ``publication``."""
        date_element = super().element
        date_element.set(OPF + 'event', 'publication')
        return date_element
class Modified(_Date):
    """Date entry flagged with the ``modification`` event.

    No value is stored; the timestamp is taken when ``element`` is read.
    """

    def __init__(self):
        super().__init__(value=None)

    @property
    def element(self):
        """Return the date element stamped with the current UTC time."""
        e = super().element
        # Emit an ISO 8601 / W3CDTF timestamp.  The former pattern
        # '%Y-%m-%d %H-%M-%S' (blank separator, hyphenated time) is not a
        # valid date value for readers that parse this field.
        e.text = datetime.datetime.now(datetime.UTC).strftime('%Y-%m-%dT%H:%M:%SZ')
        e.set(OPF + 'event', 'modification')
        return e
class Description(_Scalar):
    """Free-text description of the book."""
    CLASS = 'description'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'description'
class Identifier(_Scalar):
    """Book identifier; a random UUID URN is generated when none is given."""
    CLASS = 'identifier'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'identifier'

    def __init__(self, value: str | None = None):
        if value is None:
            value = uuid.uuid4().urn
        super().__init__(value=value)

    @property
    def element(self):
        """Return the identifier element with ``id`` and ``opf:scheme`` set."""
        e = super().element
        e.set('id', 'BookId')
        # NOTE(review): the scheme is always 'UUID', also for caller-supplied
        # values - confirm callers only pass UUIDs.
        e.set(OPF + 'scheme', 'UUID')
        return e
class Language(_Scalar):
    """Language of the book."""
    CLASS = 'language'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'language'
class Publisher(_Scalar):
    """Publisher of the book."""
    CLASS = 'publisher'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'publisher'
class Source(_Scalar):
    """Source the book was taken from."""
    CLASS = 'source'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'source'
class Subject(_Scalar):
    """One subject (keyword) of the book."""
    # NOTE(review): capitalised, unlike every sibling CLASS - confirm whether
    # the stylesheet expects 'Subject' or 'subject'.
    CLASS = 'Subject'
    # Set explicitly: the base class computes TAG once from its own empty
    # CLASS, so without this the element would have no local name.
    TAG = DC + 'subject'
class _Calibre(_Scalar):
    """Base for values rendered as ``<meta name="calibre:TAG" content=...>``.

    Subclasses set ``TAG`` to the bare calibre field name.
    """

    @property
    def element(self):
        """Return the ``meta`` element for this value."""
        # Exposed as a property like ``element`` on every other metadata
        # class, so callers can treat all entries uniformly.  Attribute values
        # must be strings, hence ``str(self)``.
        return etree.Element('meta', name=f'calibre:{self.TAG}', content=str(self))
class Series(_Calibre):
    """Series name, stored under the calibre field ``series``."""
    TAG = 'series'
class Index(_Calibre):
    """Position within the series, stored under the calibre field ``series_index``."""
    # Derives from _Calibre like Series: 'series_index' is a calibre field
    # name, not a Dublin Core element, so it must not be rendered as a bare
    # <series_index> element.
    TAG = 'series_index'
class Frontmatter:
    """Book metadata collected from keyword arguments, renderable as a title page."""
    creators: list[Author] = None
    description: Description = None
    identifier: Identifier = None
    index: Index = None
    language: Language = None
    published: Published = None
    publisher: Publisher = None
    series: Series = None
    source: Source = None
    subjects: list[Subject] = None
    title: Title = None

    def __init__(self, **kwargs):
        """Store every keyword as an attribute.

        ``creator`` and ``subject`` are additionally exposed as the lists
        ``creators`` and ``subjects``; a list or tuple is taken as is and a
        single value is wrapped.
        """
        self.creators = self._as_list(kwargs['creator']) if 'creator' in kwargs else None
        self.subjects = self._as_list(kwargs['subject']) if 'subject' in kwargs else None
        self.__dict__.update(kwargs)

    @staticmethod
    def _as_list(value):
        """Return *value* as a list without nesting an existing list or tuple."""
        return list(value) if isinstance(value, (list, tuple)) else [value]

    @property
    def as_title_page(self):
        """Return the title page as an XHTML tree built on the shared template."""
        doc = get_xhtml_template()
        root = doc.getroot()
        # The E factory only accepts strings and elements as children, so
        # metadata objects are converted with str() before being passed in.
        title_text = '' if self.title is None else str(self.title)
        root.append(
            E.head(
                E.title(title_text),
                E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
            )
        )
        table_body = E.tbody()
        root.append(E.body(E.table(table_body), **{'class': 'title-page'}))
        if self.creators:
            # One cell with all creators, separated by line breaks.
            items = []
            for creator in self.creators:
                if items:
                    items.append(E.br())
                items.append(str(creator))
            # NOTE(review): the class sits on <tr> here but on <td> for the
            # other rows - confirm which one the stylesheet targets.
            table_body.append(E.tr(E.td(*items), **{'class': 'author'}))
        if self.title is not None:
            table_body.append(self.title.as_title_page)
        if self.series:
            items = [str(self.series)]
            if self.index:
                items.extend((E.br(), str(self.index)))
            table_body.append(E.tr(E.td(*items), **{'class': 'sub-title'}))
        if self.description:
            table_body.append(self.description.as_title_page)
        return doc

49
markepub/normalize-md.py Normal file
View File

@@ -0,0 +1,49 @@
import re
# Compiled once at import time; used for every paragraph.
_MULTI_SPACE = re.compile(r' +')
_SENTENCE_END = re.compile(r'([!?.])\s')
_QUOTED_SENTENCE_END = re.compile(r'([!?.])”\s')


def _split_sentences(paragraph: str) -> str:
    """Collapse runs of spaces and break the line after each sentence end."""
    paragraph = _MULTI_SPACE.sub(' ', paragraph)
    paragraph = _SENTENCE_END.sub(r'\1\n', paragraph)
    return _QUOTED_SENTENCE_END.sub(r'\1”\n', paragraph)


def normalize_markdown(text: list[str]) -> str:
    """Return the normalized Markdown document for the given lines.

    A leading block delimited by ``---`` lines is kept verbatim as front
    matter.  In the remainder, consecutive non-empty lines are joined into one
    paragraph, runs of spaces are collapsed, a line break is inserted after
    ``.``, ``!`` or ``?`` (optionally followed by a closing curly quote) when
    whitespace follows, and paragraphs are separated by one empty line.

    Args:
        text: Lines of the document, without trailing newlines.  The list is
            not modified.

    Returns:
        The normalized document; an empty string for empty input.
    """
    lines = list(text)  # work on a copy so the caller's list stays intact

    front_matter = []
    body = lines
    if lines and lines[0] == '---':
        try:
            closing = lines.index('---', 1)
        except ValueError:
            # Unterminated front matter: everything after the opening
            # delimiter belongs to it.
            closing = len(lines)
        front_matter = lines[1:closing]
        body = lines[closing + 1:]

    paragraphs = []
    current = []
    for line in body:
        if line:
            current.append(line)
        elif current:
            # An empty line ends the paragraph; further empty lines are
            # ignored.
            paragraphs.append(' '.join(current))
            current = []
    if current:
        paragraphs.append(' '.join(current))

    header = '---\n' + '\n'.join(front_matter) + '\n---\n\n' if front_matter else ''
    return header + '\n\n'.join(_split_sentences(p) for p in paragraphs)
if __name__ == '__main__':
    # Rewrite Karen.md in place: read and right-strip every line, normalize,
    # then overwrite the file with the result.
    with open('Karen.md', encoding='utf-8') as f:
        stripped_lines = [raw.rstrip() for raw in f]
        result = normalize_markdown(stripped_lines)
    with open('Karen.md', 'w', encoding='utf-8') as f:
        f.write(result)

165
markepub/title_page.py Normal file
View File

@@ -0,0 +1,165 @@
from lxml.builder import E
from lxml import etree
import pandoc
from pandoc.types import *
from util import get_xhtml_template
def class_(*args):  # trailing underscore because "class" is a Python keyword
    """Return ``{"class": ...}`` with the given names joined by single spaces."""
    joined = ' '.join(args)
    return {"class": joined}
def _render(value, **write_options) -> str:
    """Return *value* as stripped text: MetaString unwrapped, others via pandoc."""
    if isinstance(value, MetaString):
        return value[0].strip()
    return pandoc.write(value, **write_options).strip()


def _entry_text(meta_dict, key) -> str:
    """Return the text of a scalar entry or of a map's ``text`` field, else ''."""
    value = meta_dict.get(key)
    if isinstance(value, MetaInlines):
        return _render(value)
    if isinstance(value, MetaMap) and 'text' in value[0]:
        return _render(value[0]['text'])
    return ''


def _author_names(creator) -> list[str]:
    """Return the names of all creators that have no role or the role ``aut``."""
    if isinstance(creator, MetaInlines):
        entries = [creator]
    elif isinstance(creator, MetaList):
        entries = creator[0]
    else:
        return []
    names = []
    for entry in entries:
        if isinstance(entry, MetaInlines):
            # A list entry given as a plain name rather than a map.
            names.append(_render(entry))
        elif isinstance(entry, MetaMap):
            fields = entry[0]
            # The role is a Meta value, never a str: compare its rendered
            # text.  Comparing the object itself to 'aut' was always unequal
            # and dropped every creator that declared a role.
            if 'role' in fields and _render(fields['role']) != 'aut':
                continue
            if 'text' in fields:
                names.append(_render(fields['text']))
    return [name for name in names if name]


def make_title_page(pandoc_meta: Meta) -> etree.ElementTree:
    """Build the XHTML title page from pandoc document metadata.

    Rows are emitted, in this order and only when non-empty, for the authors
    (all of them, comma separated), ``title``, ``sub-title`` and
    ``description``.  The title also becomes the document ``<title>``.

    Args:
        pandoc_meta: Metadata element whose first field is the key/value dict.

    Returns:
        The template tree with ``<head>`` appended and, if any row was
        produced, a ``<body>`` holding the table.
    """
    meta_dict = pandoc_meta[0]
    rows = []

    def add_row(text, css_class):
        # Skip empty values so no blank table rows are produced.
        if text:
            rows.append(E.tr(E.td(text, class_(css_class))))

    add_row(', '.join(_author_names(meta_dict.get('creator'))), 'author')
    title = _entry_text(meta_dict, 'title')
    add_row(title, 'title')
    add_row(_entry_text(meta_dict, 'sub-title'), 'sub-title')
    description = meta_dict.get('description')
    if isinstance(description, MetaInlines):
        add_row(
            _render(description, format='html5', options=["--wrap=none"]),
            'description',
        )

    doc = get_xhtml_template()
    root = doc.getroot()
    root.append(
        E.head(
            E.title(title),
            E.link(href="../Styles/title-page.css", type="text/css", rel="stylesheet"),
        )
    )
    if rows:
        root.append(E.body(E.table(E.tbody(*rows))))
    return doc
def prettyprint(element, **kwargs):
    """Print *element* serialised with pretty-printing; no extra newline is added.

    Additional keyword arguments are forwarded to ``etree.tostring``.
    """
    serialized = etree.tostring(element, pretty_print=True, **kwargs)
    print(serialized.decode(), end='')
if __name__ == '__main__':
    # Hand-written sample metadata tree used to exercise make_title_page.
    data = Meta({
        'creator': MetaList([
            MetaMap({
                'file-as': MetaInlines([Str('Riter,'), Space(), Str('E.Z.')]),
                'role': MetaInlines([Str('aut')]),
                'text': MetaInlines([Str('E.Z.'), Space(), Str('Riter')]),
            }),
        ]),
        'description': MetaInlines([
            Str('Karen'), Space(), Str('meets'), Space(), Str('the'), Space(),
            Str('man'), Space(), Str('she'), Space(), Str('cant'), Space(),
            Str('resist'), Space(), Str(''), Space(), Str('the'), Space(),
            Str('man'), Space(), Str('who'), Space(), Str('can'), Space(),
            Str('do'), Space(), Str('anything'), Space(), Str('to'), Space(),
            Str('her,'), Space(), Str('and'), Space(), Str('she'), Space(),
            Str('will'), Space(), Str('love'), Space(), Str('it'), Space(),
            Str(''), Space(), Str('and'), Space(), Str('him.'),
        ]),
        'language': MetaInlines([Str('en')]),
        'published': MetaInlines([Str('2002-07-24')]),
        'source': MetaInlines([Str('https://www.bdsmlibrary.info/stories/story.php?storyid=1101')]),
        'subject': MetaList([
            MetaInlines([Str('M/f')]),
            MetaInlines([Str('pregnant')]),
            MetaInlines([Str('spanking')]),
            MetaInlines([Str('D/s')]),
            MetaInlines([Str('real')]),
            MetaInlines([Str('reluctant')]),
            MetaInlines([Str('Serious')]),
        ]),
        'title': MetaMap({
            'file-as': MetaInlines([Str('Karen')]),
            'text': MetaInlines([Str('Karen')]),
        }),
    })
    doc = make_title_page(data)
    # Serialise with an XML declaration and print without a trailing newline.
    serialized = etree.tostring(doc, pretty_print=True, xml_declaration=True, encoding='utf-8')
    print(serialized.decode(), end='')

9
markepub/util.py Normal file
View File

@@ -0,0 +1,9 @@
from lxml import etree
def get_xhtml_template():
    """Return a tree whose root is an empty ``<html>`` element in the XHTML namespace.

    The parsed skeleton carries an XML declaration and the XHTML 1.1 DOCTYPE.
    """
    skeleton = (
        '<?xml version="1.0"?>'
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        '<html xmlns="http://www.w3.org/1999/xhtml"></html>'
    )
    return etree.ElementTree(etree.XML(skeleton))

10
pyproject.toml Normal file
View File

@@ -0,0 +1,10 @@
[project]
name = "markepub"
version = "0.1.0"
description = "Add your description here"
requires-python = ">=3.13"
dependencies = [
"lxml>=6.0.1",
"pandoc>=2.4",
"pyyaml>=6.0.2",
]