stuff/cook.py

81 lines
3.4 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from zipfile import ZipFile
from pathlib import Path
import os, sys, time, re, shutil, subprocess
# Path of the EPUB to process, taken from the first command-line argument.
file_name = sys.argv[1]
# Archive member names of the HTML/XHTML documents selected for processing;
# filled while the EPUB is scanned further down in this script.
files_to_work_on = []
# Member names whose rewritten copy under work/ goes into output.epub.
files_to_zip = []
def progressbar(it, prefix="", size=60, out=sys.stdout, fill="#"):  # Python3.6+
    """Yield the items of *it* while drawing a text progress bar on *out*.

    After each item has been handed back to (and processed by) the caller,
    the bar is redrawn in place using a carriage return.  Once the iterable
    is exhausted a final newline is printed so later output starts on a
    fresh line.

    Args:
        it: A sized iterable (``len(it)`` is called once, on first use).
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Text stream the bar is written to.
        fill: Single character used for the completed part of the bar.
            It must be visible: with an empty or zero-width fill the bar
            loses its fixed width and only shows the shrinking run of dots.

    Yields:
        The items of *it*, unchanged and in order.
    """
    count = len(it)

    def show(done):
        # `show` is only called from inside the loop, so count >= 1 here.
        filled = int(size * done / count)
        bar = fill * filled + "." * (size - filled)
        print(f"{prefix}[{bar}] {done}/{count}", end="\r", file=out, flush=True)

    for done, item in enumerate(it, start=1):
        yield item
        show(done)
    print("\n", flush=True, file=out)
class NoStdStreams:
    """Context manager that temporarily replaces ``sys.stdout``/``sys.stderr``.

    Each stream is redirected to the object passed to the constructor or,
    when none is given, to a handle on ``os.devnull`` so the output is
    discarded.  The previous streams are put back on exit and the devnull
    handle is closed, so an instance is meant to be used for a single
    ``with`` block.
    """

    def __init__(self, stdout=None, stderr=None):
        # Opened eagerly: it is the fallback target for both streams.
        self.devnull = open(os.devnull, 'w')
        self._stdout = stdout if stdout else self.devnull
        self._stderr = stderr if stderr else self.devnull

    def __enter__(self):
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr
        # Push out anything still buffered before the streams are swapped.
        for stream in (self.old_stdout, self.old_stderr):
            stream.flush()
        sys.stdout = self._stdout
        sys.stderr = self._stderr

    def __exit__(self, exc_type, exc_value, traceback):
        for stream in (self._stdout, self._stderr):
            stream.flush()
        sys.stdout, sys.stderr = self.old_stdout, self.old_stderr
        self.devnull.close()
# ---------------------------------------------------------------------------
# Cook the EPUB.
#
# For every HTML document in the archive, each
# <a class="hlink" href="other-file#note-id"> link is turned into an EPUB
# noteref pointing at a copy of the note paragraph, which is appended just
# before </body> of the linking document.  Rewritten documents are stored
# under work/ and a fresh output.epub is assembled from the original archive
# with those documents swapped in.
# ---------------------------------------------------------------------------
OUTPUT_NAME = "output.epub"
WORK_DIR = "work"
# CSS class of the <p> elements that hold the notes.  VHS: Change hanging1
NOTE_PARAGRAPH_CLASS = "hanging1"

# Member names ending in .htm, .html, .xhtm or .xhtml.
HTML_NAME_RE = re.compile(r"\.x?html?$")
# An hlink anchor that points into another file: captures (file, fragment).
# The [^>] / [^"#] classes keep the match inside a single tag.
HLINK_RE = re.compile(r'<a class="hlink"[^>]*?href="([^"#]+)#([^"]+)"[^>]*>')
# A note paragraph that starts with an anchor (usually the back-link):
# group 1 is the attribute text of <p> without its id, group 2 the note text.
LEADING_ANCHOR_RE = re.compile(r'<p(?:(.*?)?(?:id=".*?")?)><a.*?</a>[.\s]*(.*?)</p>')
BODY_END_RE = re.compile(r"\s*</body>")

# Extracted documents already read from disk, keyed by member name, so a
# notes file is not re-read for every link that points into it.
_text_cache = {}


def _read_text(member):
    """Return the text of an extracted document (EPUB content is read as UTF-8)."""
    if member not in _text_cache:
        with open(member, 'r', encoding="utf-8") as handle:
            _text_cache[member] = handle.read()
    return _text_cache[member]


def _find_note(href_file, note_id):
    """Return the markup of the note paragraph a link points to, or None."""
    # Drop leading "./" and "../" so relative hrefs can still be matched
    # against the archive member names.
    tail = re.sub(r"^(?:\.{1,2}/)+", "", href_file)
    if not tail:
        return None
    # The id may sit on the <p> itself or on the anchor that opens it, hence
    # the permissive .*? between the class and the id.
    note_re = re.compile(
        rf'<p class="{re.escape(NOTE_PARAGRAPH_CLASS)}".*?id="{re.escape(note_id)}">.*?</p>'
    )
    for candidate in files_to_work_on:
        if tail in candidate:
            found = note_re.search(_read_text(candidate))
            if found:
                return found.group()
    return None


def _as_footnote(paragraph, note_id):
    """Turn a note paragraph into an ``epub:type="footnote"`` paragraph."""
    def rebuild(found):
        # Strip the leading anchor and put the (quoted) id on the <p> itself.
        attributes = (found.group(1) or "").rstrip()
        return f'<p{attributes} id="{note_id}">{found.group(2)}</p>'

    without_anchor = LEADING_ANCHOR_RE.sub(rebuild, paragraph)
    # NOTE(review): epub:type needs the epub namespace declared on <html>;
    # the documents are assumed to declare it already - confirm.
    return re.sub(r'<p (.*?)</p>', r'<p epub:type="footnote" \1</p>', without_anchor)


# Writing the output over the input would destroy the archive while it is
# still being read.
if Path(file_name).resolve() == Path(OUTPUT_NAME).resolve():
    sys.exit(f"Refusing to cook {file_name}: it is also the output file.")

with ZipFile(file_name, 'r') as archive:
    for member in archive.namelist():
        if HTML_NAME_RE.search(member):
            archive.extract(member)
            files_to_work_on.append(member)

Path(WORK_DIR).mkdir(parents=True, exist_ok=True)
print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it")
for member in progressbar(files_to_work_on, "", 40):
    text = _read_text(member)
    if 'id="toc"' in text:
        continue  # leave the table of contents untouched
    changed = False
    # dict.fromkeys() drops repeated links so a note is appended only once.
    for href_file, note_id in dict.fromkeys(HLINK_RE.findall(text)):
        paragraph = _find_note(href_file, note_id)
        if paragraph is None:
            continue
        footnote = _as_footnote(paragraph, note_id)
        noteref = f'<a epub:type="noteref" class="hlink" href="#{note_id}">'
        # Only the anchors that point at this very note are rewritten.
        link_re = re.compile(
            r'<a class="hlink"[^>]*?href="'
            + re.escape(f"{href_file}#{note_id}")
            + r'"[^>]*>'
        )
        # Function replacements keep backslashes in the document text from
        # being interpreted as regex template escapes.
        relinked = link_re.sub(lambda _m: noteref, text)
        with_note, inserted = BODY_END_RE.subn(
            lambda _m: f"\n{footnote}\n</body>", relinked, count=1
        )
        if inserted:
            # Commit only when the note really landed in the document, so a
            # link never ends up pointing at a missing target.
            text = with_note
            changed = True
    if changed:
        target = Path(WORK_DIR, member)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(text, encoding="utf-8")
        files_to_zip.append(member)

# Rebuild the archive entry by entry instead of appending to a copy:
# appending leaves two members with the same name in the EPUB.  Walking the
# original entries in order also keeps "mimetype" first and stored the way
# it was.
rewritten = set(files_to_zip)
with ZipFile(file_name, 'r') as original, ZipFile(OUTPUT_NAME, 'w') as cooked:
    for entry in original.infolist():
        if entry.filename in rewritten:
            cooked.write(
                f"{WORK_DIR}/{entry.filename}",
                entry.filename,
                compress_type=entry.compress_type,
            )
        else:
            cooked.writestr(entry, original.read(entry))
# The work/ directory is left in place for inspection; remove it with
# shutil.rmtree("work") when it is no longer needed.