# stuff/cook.py

from zipfile import ZipFile
from pathlib import Path
import os, sys, time, re, shutil, argparse

# Command-line interface: one positional EPUB path plus optional overrides for
# the two CSS class names the script looks for.
# The text is passed as ``description``: the first positional parameter of
# ArgumentParser is ``prog``, which would replace the script name in the usage
# line with "epub cooking".
parser = argparse.ArgumentParser(description="epub cooking")
parser.add_argument(
    "--cc",
    metavar="CHAPTERS_CLASS",
    help="The class used in chapters. (default: hlink)",
    type=str,
)
parser.add_argument(
    "--fc",
    metavar="FOOTNOTES_CLASS",
    help="The class used in footnotes. (default: hanging1)",
    type=str,
)
# TODO: add a --test option to try a link or footnote against the regexes
# (not implemented).
parser.add_argument("file", help="The file to be worked on.", type=str)
args = parser.parse_args()

# Path of the EPUB given on the command line, plus the two lists the rest of
# the script fills in.
file_name = args.file
files_to_work_on, files_to_zip = [], []

# VHS: hlink is the class found in the chapters; pass --cc to change it.
chapter_class = args.cc or "hlink"
# VHS: hanging1 is the class found in the footnotes; pass --fc to change it.
footnotes_class = args.fc or "hanging1"

def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+
    """Yield the items of *it* while drawing a text progress bar on *out*.

    it: a sized iterable; ``len(it)`` is taken when iteration starts.
    prefix: text printed in front of the bar.
    size: width of the bar in characters.
    out: stream the bar is written to.

    The bar is redrawn in place (terminated by a carriage return) each time
    the consumer comes back for the next item, and two newline characters are
    written once the iterable is exhausted.
    """
    count = len(it)

    def show(j):
        filled = int(size * j / count)
        bar = '█' * filled + '.' * (size - filled)
        print(f"{prefix}[{bar}] {j}/{count}", end='\r', file=out, flush=True)

    for done, item in enumerate(it, start=1):
        yield item
        show(done)
    print("\n", flush=True, file=out)

class NoStdStreams(object):
    """Context manager that temporarily replaces sys.stdout and sys.stderr.

    stdout / stderr: objects to install in place of the real streams.  When
    one is omitted (or falsy) output for that stream goes to os.devnull.

    The streams that were active on entry are restored on exit, even when
    flushing the replacement streams fails.  The devnull handle is opened in
    __init__ and closed on exit, so an instance is meant to be used once.
    """

    def __init__(self,stdout = None, stderr = None):
        self.devnull = open(os.devnull,'w')
        # An open file object is always truthy, so devnull is the last
        # fallback that can ever be reached.
        self._stdout = stdout or self.devnull
        self._stderr = stderr or self.devnull

    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Push out anything still buffered so it is not lost or reordered.
        self.old_stdout.flush()
        self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self._stdout.flush()
            self._stderr.flush()
        finally:
            # Always hand the real streams back, whatever the flushes did.
            sys.stdout = self.old_stdout
            sys.stderr = self.old_stderr
            self.devnull.close()

# Extract every HTML/XHTML document of the EPUB into the current directory and
# remember its archive name for processing.
# The pattern is anchored on the file extension (.htm, .html, .xhtm, .xhtml);
# an unanchored '.x?html?' also matches names that merely contain "htm"
# somewhere, such as "images/html5.png".
content_document = re.compile(r'\.x?html?$', re.IGNORECASE)

with ZipFile(file_name, 'r') as epub:
    for member in epub.namelist():
        if content_document.search(member):
            epub.extract(member)
            files_to_work_on.append(member)

# Create the work/ tree that receives the rewritten documents.  The parent
# directory of every extracted document is mirrored under work/, so archives
# with fewer than three documents, nested folders or documents at the archive
# root are all handled.
Path("work").mkdir(parents=True, exist_ok=True)
for extracted_name in files_to_work_on:
    Path("work", extracted_name).parent.mkdir(parents=True, exist_ok=True)

print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it")

# Turn cross-file footnote links into EPUB 3 pop-up footnotes.
#
# Every extracted document that is not a table of contents is scanned for
# <a ...> tags.  For each href of the form "TARGET#ID", the extracted files
# whose archive name matches TARGET are searched for a
# <p class="{footnotes_class}" ... id="ID"> paragraph.  The first one found is
# stripped of its leading anchor, marked epub:type="footnote" and inserted
# before </body>; the links to it become epub:type="noteref" links to "#ID".
# Documents containing at least one <a> tag are written to work/ and queued
# in files_to_zip.
toc_marker = re.compile(r'id="toc"|epub:type="toc"')
# Group 1 is the href part before '#', group 2 the fragment id; both are empty
# strings for anchors that have no such href.
link_pattern = re.compile(
    f'<a (?:(?:class="{chapter_class}")|(?:href="(.*?)#(.*?)")|(?:.*?))+>'
)
# Group 1: attributes in front of an optional trailing id; group 2: the note
# text after the leading <a>...</a> and any dots/whitespace following it.
note_cleanup = re.compile(r'<p(?:(.*?)?(?:id=".*?")?)><a.*?<\/a>[\.\s ]*(.*?)<\/p>')
paragraph_start = re.compile(r'<p (.*?)</p>')
body_end = re.compile(r"\n\s*</body>")
# Text of every footnote source file read so far, keyed by archive name, so a
# file is read once no matter how many links point into it.
source_texts = {}

# Wrap files_to_work_on in progressbar(files_to_work_on, "", 40) to show progress.
for file in files_to_work_on:
    # Read and write UTF-8 explicitly rather than relying on the locale default.
    with open(file, 'r', encoding="utf-8") as epub_file:
        text = epub_file.read()
    if toc_marker.search(text):
        continue
    matches = link_pattern.findall(text)
    if not matches:
        continue

    seen_links = set()
    for target, note_id in matches:
        # Skip anchors without a "file#id" href and links already handled, so
        # a note referenced twice is only inserted once.
        if target == '' or (target, note_id) in seen_links:
            continue
        seen_links.add((target, note_id))

        footnote_pattern = re.compile(
            f'<p class="{footnotes_class}".*?id="{re.escape(note_id)}".*?</p>'
        )
        # Only anchors carrying exactly this href are rewritten; a pattern
        # with link_pattern's catch-all alternative matches every <a> tag.
        noteref_pattern = re.compile(
            rf'<a (?:[^>\n]*?\s)?href="{re.escape(target)}#{re.escape(note_id)}"[^>\n]*>'
        )
        noteref = f'<a epub:type="noteref" class="{chapter_class}" href="#{note_id}">'

        for dd in files_to_work_on:
            # NOTE: the href is used as an unescaped regex against the archive
            # name, so '.' in it (including a "../" prefix) matches any
            # character.
            if not re.search(f".*?{target}.*?", dd):
                continue
            if dd not in source_texts:
                with open(dd, 'r', encoding="utf-8") as source:
                    source_texts[dd] = source.read()
            source_match = footnote_pattern.search(source_texts[dd])
            if not source_match:
                continue

            # Function replacements keep backslashes in the note text or id
            # from being read as regex replacement escapes.  The id value is
            # quoted, as XHTML attributes must be.
            footnote = note_cleanup.sub(
                lambda m: f'<p{m.group(1) or ""} id="{note_id}">{m.group(2)}</p>',
                source_match.group(),
            )
            footnote = paragraph_start.sub(r'<p epub:type="footnote" \1</p>', footnote)
            text = noteref_pattern.sub(lambda _m: noteref, text)
            text = body_end.sub(lambda _m: f"\n{footnote}\n</body>", text)
            break  # one local copy of the note per link target is enough

    output_path = Path(f"work/{file}")
    # Make sure the target folder exists even for documents in nested folders.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as output:
        output.write(text)
    files_to_zip.append(file)

# Build output.epub from the input archive, replacing each rewritten document
# with its version from work/.  Entries are written in their original order
# with their original compression method.  Rebuilding the archive, instead of
# appending to a copy, keeps exactly one entry per name, so there is no
# duplicate-name warning to hide.
if os.path.exists("output.epub") and os.path.samefile(file_name, "output.epub"):
    # Opening the output for writing would truncate the input.
    raise shutil.SameFileError(f"{file_name!r} and 'output.epub' are the same file")

rewritten = set(files_to_zip)
with ZipFile(file_name, 'r') as original_epub, ZipFile("output.epub", 'w') as cooked_epub:
    for entry in original_epub.infolist():
        if entry.filename in rewritten:
            cooked_epub.write(
                f"work/{entry.filename}",
                entry.filename,
                compress_type=entry.compress_type,
            )
        else:
            cooked_epub.writestr(entry, original_epub.read(entry))

shutil.rmtree("work")