stuff/cook.py

97 lines
4.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from zipfile import ZipFile
from pathlib import Path
import os, sys, time, re, shutil, argparse
# Command-line interface: one positional EPUB path plus optional overrides for
# the class names used to recognise chapter links and footnote paragraphs.
parser = argparse.ArgumentParser("epub cooking")
parser.add_argument(
    "--cc",
    metavar="CHAPTERS_CLASS",
    help="The class used in chapters. (default: hlink)",
    type=str,
)
parser.add_argument(
    "--fc",
    metavar="FOOTNOTES_CLASS",
    help="The class used in footnotes. (default: hanging1)",
    type=str,
)
#parser.add_argument("--test", help="Test a link or footnote against the regex. (NotImplemented)", type=bool)
parser.add_argument("file", help="The file to be worked on.", type=str)
args = parser.parse_args()
file_name = args.file

# Work lists shared with the rest of the script: documents extracted from the
# archive, and documents that have to be put back into it.
files_to_work_on = []
files_to_zip = []

# A missing (or empty) option falls back to the built-in class name.
# VHS: Change hlink, this is the class found in the chapters
chapter_class = args.cc or "hlink"
# VHS: Change hanging1, this is the class found in the footnotes
footnotes_class = args.fc or "hanging1"
def progressbar(it, prefix="", size=60, out=sys.stdout, fill="#"):  # Python3.6+
    """Yield the items of ``it`` while drawing a text progress bar on ``out``.

    Args:
        it: A sized iterable; ``len(it)`` is taken before the first item.
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Stream the bar is written to.
        fill: Character used for the completed part of the bar.

    Yields:
        Each item of ``it`` in order. The bar is redrawn once the consumer is
        done with an item, i.e. when the next one is requested.

    Raises:
        TypeError: When iteration starts, if ``it`` has no length.
    """
    count = len(it)

    def show(j):
        x = int(size * j / count)
        # '\r' moves back to the start of the line so each redraw overwrites
        # the previous one instead of scrolling.
        print(f"{prefix}[{fill * x}{'.' * (size - x)}] {j}/{count}", end='\r', file=out, flush=True)

    for i, item in enumerate(it, 1):
        yield item
        show(i)
    # Leave the finished bar on screen and move below it.
    print("\n", flush=True, file=out)
class NoStdStreams(object):
    """Context manager that temporarily replaces ``sys.stdout`` / ``sys.stderr``.

    Inside the ``with`` block both streams point at the given replacements, or
    at ``os.devnull`` when no replacement was given; on exit the previous
    streams are put back. Only the Python-level ``sys.stdout`` and
    ``sys.stderr`` objects are swapped.

    The null device is opened on entry and closed on exit, so an instance that
    is never entered holds no file handle and an instance can be reused.

    Args:
        stdout: Replacement for ``sys.stdout`` (default: the null device).
        stderr: Replacement for ``sys.stderr`` (default: the null device).
    """

    def __init__(self, stdout=None, stderr=None):
        self._stdout_arg = stdout
        self._stderr_arg = stderr
        self.devnull = None

    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Flush pending output first so it is not emitted after the swap.
        self.old_stdout.flush()
        self.old_stderr.flush()
        self.devnull = open(os.devnull, 'w')
        self._stdout = self._stdout_arg or self.devnull
        self._stderr = self._stderr_arg or self.devnull
        sys.stdout, sys.stderr = self._stdout, self._stderr
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self._stdout.flush()
            self._stderr.flush()
        finally:
            # Always restore the real streams, even if a flush failed.
            sys.stdout = self.old_stdout
            sys.stderr = self.old_stderr
            self.devnull.close()
            self.devnull = None
# Extract every (X)HTML document of the EPUB into the current directory and
# remember its archive path for the steps below.
# The test is on the real file extension: a loose "contains something like
# html" check would also pick up names such as "styles/html.css".
HTML_SUFFIXES = (".htm", ".html", ".xhtm", ".xhtml")
with ZipFile(file_name, 'r') as archive:
    for filename in archive.namelist():
        if filename.endswith(HTML_SUFFIXES):
            archive.extract(filename)
            files_to_work_on.append(filename)
# Mirror the directory of every extracted document under "work/" so rewritten
# copies can be saved at the same relative path, however deeply it is nested
# and however few documents the archive contains.
Path("work").mkdir(parents=True, exist_ok=True)
for extracted in files_to_work_on:
    Path("work", extracted).parent.mkdir(parents=True, exist_ok=True)
print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it")
# Turn cross-document footnote links into local EPUB 3 footnotes: for every
# chapter link that points at a footnote paragraph in another extracted
# document, a cleaned copy of that paragraph is appended to the chapter's
# <body> and the link becomes a local epub:type="noteref" anchor.
TOC_MARKER = re.compile(r'id="toc"|epub:type="toc"')
ANCHOR_TAG = re.compile(r'<a\s[^>]*>')
FRAGMENT_HREF = re.compile(r'\shref="([^"#]+)#([^"]+)"')
# NOTE(review): the character class after </a> previously also listed a literal
# blank, possibly a non-breaking space; \s matches both.
FOOTNOTE_LEAD = re.compile(r'<p(?:(.*?)?(?:id=".*?")?)><a.*?</a>[.\s]*(.*?)</p>')
BODY_END = re.compile(r'\n\s*</body>')

known_documents = set(files_to_work_on)
footnote_sources = {}


def resolve_href(document, target):
    """Return the archive path that the relative href ``target`` names, seen from ``document``."""
    joined = os.path.join(os.path.dirname(document), target)
    # Archive names always use "/", whatever the local path separator is.
    return os.path.normpath(joined).replace(os.sep, "/")


def read_footnote_source(name):
    """Return the text of the extracted document ``name``, reading it from disk only once."""
    if name not in footnote_sources:
        with open(name, 'r', encoding="utf-8") as source:
            footnote_sources[name] = source.read()
    return footnote_sources[name]


def extract_footnote(markup, note_id):
    """Return the cleaned footnote paragraph carrying ``note_id`` in ``markup``, or None."""
    found = re.search(
        rf'<p class="{footnotes_class}".*?id="{re.escape(note_id)}".*?</p>', markup
    )
    if found is None:
        return None
    # Drop the leading anchor (plus any "." / whitespace after it) and put the
    # id, quoted, on the paragraph itself. A function is used as replacement so
    # the id is inserted literally and never parsed as a substitution template.
    paragraph = FOOTNOTE_LEAD.sub(
        lambda m: f'<p{m.group(1) or ""} id="{note_id}">{m.group(2)}</p>',
        found.group(),
    )
    return re.sub(r'<p (.*?)</p>', r'<p epub:type="footnote" \1</p>', paragraph)


def rewrite_links(markup, target, note_id):
    """Point the anchors in ``markup`` that link to ``target#note_id`` at the local copy."""
    noteref = f'<a epub:type="noteref" class="{chapter_class}" href="#{note_id}">'

    def swap(tag):
        href = FRAGMENT_HREF.search(tag.group())
        if href and href.groups() == (target, note_id):
            return noteref
        # Every other anchor is left exactly as it was.
        return tag.group()

    return ANCHOR_TAG.sub(swap, markup)


def append_to_body(markup, footnote):
    """Insert ``footnote`` right before ``</body>``; return (markup, number of insertions)."""
    return BODY_END.subn(lambda _m: f"\n{footnote}\n</body>", markup)


for file in files_to_work_on: #progressbar(files_to_work_on, "", 40):
    with open(file, 'r', encoding="utf-8") as epub_file:
        text = epub_file.read()
    # Table-of-contents documents are left alone.
    if TOC_MARKER.search(text):
        continue
    untouched = text

    # Collect each distinct (document, fragment) link once, in first-seen
    # order, so a footnote referenced twice is only appended once.
    links = {}
    for tag in ANCHOR_TAG.findall(text):
        href = FRAGMENT_HREF.search(tag)
        if href:
            links[href.groups()] = None

    for target, note_id in links:
        source_name = resolve_href(file, target)
        if source_name not in known_documents:
            continue
        footnote = extract_footnote(read_footnote_source(source_name), note_id)
        if footnote is None:
            continue
        candidate, inserted = append_to_body(rewrite_links(text, target, note_id), footnote)
        # Only keep the rewritten links when the footnote really was added;
        # otherwise they would point at an id that does not exist here.
        if inserted:
            text = candidate

    if text == untouched:
        continue
    destination = Path("work", file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, 'w', encoding="utf-8") as output:
        output.write(text)
    files_to_zip.append(file)
# Build output.epub as a fresh archive: every entry of the input is copied in
# its original order, and entries that were rewritten are taken from "work/".
# Writing each name exactly once avoids the duplicate entries that appending
# to a copy of the input would leave behind.
if os.path.exists("output.epub") and os.path.samefile(file_name, "output.epub"):
    raise shutil.SameFileError(f"{file_name!r} and 'output.epub' are the same file")
rewritten = set(files_to_zip)
try:
    with ZipFile(file_name, 'r') as source_archive, ZipFile("output.epub", 'w') as target_archive:
        for entry in source_archive.infolist():
            if entry.filename in rewritten:
                data = Path("work", entry.filename).read_bytes()
            else:
                data = source_archive.read(entry)
            # Passing the ZipInfo keeps the entry's name, timestamp and
            # compression method.
            target_archive.writestr(entry, data)
finally:
    # The scratch directory is removed whether or not packing succeeded.
    shutil.rmtree("work")