stuff/cook.py

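"""Cook an epub: extract its (x)html files, convert cross-file footnote links
into inline epub:type="noteref"/"footnote" pairs, and write the modified
chapters back into a copy of the book named output.epub."""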
from zipfile import ZipFile
from pathlib import Path
import os, sys, re, shutil, argparse
parser = argparse.ArgumentParser("epub cooking")
#parser.add_argument("--test", help="Test a link or footnote against the regex. (NotImplemented)", type=bool)
parser.add_argument("file", help="The file to be worked on.", type=str)
args = parser.parse_args()
file_name = args.file
files_to_work_on = []
files_to_zip = []
lastItemLength = 1
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+
    """Yield items from `it` while drawing a console progress bar that names the current item."""
    count = len(it)
    def show(j, item):
        global lastItemLength
        x = int(size*j/count)
        # Pad with spaces so leftovers from a longer previous item name get erased.
        spacesToPrint = lastItemLength - len(item)
        # The bar is drawn with '█' for the finished part and '.' for the rest.
        print(f"{prefix}[{u'█'*x}{('.'*(size-x))}] {j}/{count} (Currently working on: {item}){' '*spacesToPrint}", end='\r', file=out, flush=True)
        lastItemLength = len(item)
    for i, item in enumerate(it):
        show(i+1, item)
        yield item
    print(f"{prefix}[{u'█'*size}] {count}/{count} {' '*lastItemLength}", end='\r', file=out, flush=True)
    print("\n", flush=True, file=out)
class NoStdStreams(object):
    def __init__(self, stdout=None, stderr=None):
        self.devnull = open(os.devnull, 'w')
        self._stdout = stdout or self.devnull or sys.stdout
        self._stderr = stderr or self.devnull or sys.stderr
    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        self.old_stdout.flush(); self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr
    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush(); self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.devnull.close()
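# Extract every (x)html file from the epub, skipping the title page.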
with ZipFile(file_name, 'r') as zip:
    for filename in zip.namelist():
        if re.search(r'\.x?html?$', filename):
            if not re.search(r'titlepage\.x?html?$', filename):
                zip.extract(filename)
                files_to_work_on.append(filename)
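# Recreate the book's top-level content directory under work/; this assumes all
# (x)html files share one top-level directory (e.g. OEBPS) inside the epub.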
Path("work").mkdir(parents=True, exist_ok=True)
Path(f"work/{files_to_work_on[2].split('/')[0]}").mkdir(parents=True, exist_ok=True)
print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it")
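# For each chapter, rewrite cross-file footnote links as epub:type="noteref" anchors
# and pull the referenced paragraph in as an epub:type="footnote" before </body>.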
for file in progressbar(files_to_work_on, "", 40):
    with open(file, 'r') as epub_file:
        text = epub_file.read()
    # Skip anything that is not an actual chapter (toc, front/backmatter, appendix).
    test = re.findall(r'id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
    if test:
        print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
        continue
    matches = re.findall(r'<a[\w\d\s="-]*?href="(.*?)#(.*?)"[\w\d\s="-]*?>', text)
    if matches:
        for match in matches:
            if match[0] != '' and not re.search(r"toc\.x?html?", match[0]):
                for dd in files_to_work_on:
                    if re.search(f".*?{match[0]}.*?", dd):
                        with open(dd, 'r') as source:
                            source_text = source.read()
                        # Ignore links whose target file is a toc, front/backmatter or appendix.
                        test = re.findall(r'id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', source_text)
                        if test:
                            continue
                        source_match = re.search(f'<p.*?id="{match[1]}".*?</p>', source_text)
                        if source_match:
                            fixed_text = re.sub(r'<a id=".*?"(?:><a)?\/>', '', text) # These are stupid, we can just as well get rid of them. Sub below also catches them.
                            fixed_text = re.sub(rf'<a(?! epub:type="noteref")(?!.*?\/>).*?href="{match[0]}#{match[1]}".*?>', rf'<a epub:type="noteref" href="#{match[1]}">', fixed_text)
                            source_match_fixed = re.sub(r'<p.*?><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p epub:type="footnote" id="{match[1]}">\1</p>', source_match.group())
                            text = re.sub(r"\n\s*</body>", f"\n{source_match_fixed}\n</body>", fixed_text)
    with open(f"work/{file}", 'w') as output:
        output.write(text)
    files_to_zip.append(file)
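# Copy the original epub and overwrite the modified chapters inside the copy.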
shutil.copy(file_name, "output.epub")
with ZipFile("output.epub", 'a') as zip:
    with NoStdStreams(): # ZipFile.write will throw a warning about duplicate files, we don't care, we just want it to overwrite those already in the epub.
        for file in files_to_zip:
            zip.write(f"work/{file}", file)
shutil.rmtree("work")
shutil.rmtree(f"{files_to_work_on[2].split('/')[0]}")