# NOTE(review): this file arrived with its line structure collapsed and with
# the contents of several regex string literals stripped — every pattern that
# contained HTML markup (note links, note-body elements, the closing body tag,
# the footnote wrapper template) now reads as an empty or truncated literal:
# re.findall(f'', ...), re.search(f"", ...), re.sub('', '', ...), the
# unbalanced f').*?href=...' below, and an rf'...' replacement spilled across
# physical lines. The script cannot work in this state; restore the original
# patterns from version control. Only comments are added here — every code
# token is left exactly as found.
#
# Purpose (from what the code visibly does): take an .epub on the command
# line, extract its (x)html chapter files, inline footnote/endnote targets
# into each chapter (the stripped-regex section), write the edited files
# under work/, and repack everything into output.epub.

from zipfile import ZipFile
from pathlib import Path
import os, sys, re, shutil, argparse

parser = argparse.ArgumentParser("epub cooking")
#parser.add_argument("--test", help="Test a link or footnote against the regex. (NotImplemented)", type=bool)
parser.add_argument("file", help="The file to be worked on.", type=str)
args = parser.parse_args()
file_name = args.file

files_to_work_on = []  # (x)html members extracted from the epub
files_to_zip = []      # members actually rewritten, to be repacked at the end
lastItemLength = 1     # width of the previous progress-bar label, used to blank it out

def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+
    """Yield the items of `it` while redrawing a single-line text progress bar.

    `size` is the bar width in characters; the line is rewritten in place
    with carriage returns on `out`.
    """
    count = len(it)
    def show(j, item):
        global lastItemLength
        x = int(size*j/count)
        # Pad with spaces so a shorter label fully overwrites the longer one
        # left behind by the previous \r rewrite.
        spacesToPrint = lastItemLength - len(item)
        print(f"{prefix}[{u'█'*x}{('.'*(size-x))}] {j}/{count} (Currently working on: {item}){' '*spacesToPrint}", end='\r', file=out, flush=True)
        lastItemLength = len(item)
    for i, item in enumerate(it):
        show(i+1, item)
        yield item
    print(f"{prefix}[{u'█'*size}] {count}/{count} {' '*lastItemLength}", end='\r', file=out, flush=True)
    print("\n", flush=True, file=out)

class NoStdStreams(object):
    """Context manager that redirects stdout/stderr (to os.devnull by default)."""
    def __init__(self,stdout = None, stderr = None):
        self.devnull = open(os.devnull,'w')
        # self.devnull is always truthy, so the trailing `or sys.std*` can
        # never fire; an explicit stdout/stderr argument still takes precedence.
        self._stdout = stdout or self.devnull or sys.stdout
        self._stderr = stderr or self.devnull or sys.stderr
    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        self.old_stdout.flush(); self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr
    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush(); self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.devnull.close()

# Extract every (x)html member except the title page into the current
# directory, remembering the member names for processing.
# NOTE(review): `zip` shadows the builtin here (pre-existing).
with ZipFile(file_name, 'r') as zip:
    for filename in zip.namelist():
        if re.search('\.x?html?$', filename):
            if not re.search('titlepage\.x?html?$', filename):
                zip.extract(filename)
                files_to_work_on.append(filename)

Path("work").mkdir(parents=True, exist_ok=True)
# NOTE(review): assumes at least three extracted members exist and that index 2
# sits inside the epub's content directory (e.g. OEBPS/) — confirm on real
# inputs; an epub with fewer (x)html files raises IndexError here.
Path(f"work/{files_to_work_on[2].split('/')[0]}").mkdir(parents=True, exist_ok=True)

print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it")

for file in progressbar(files_to_work_on, "", 40):
    with open(file, 'r') as epub_file:
        text = epub_file.read()
    # Skip non-chapter documents: anything flagged as toc / frontmatter /
    # backmatter / appendix.
    test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
    if test:
        print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
        continue
    # NOTE(review): pattern lost in the paste — judging by the match[0]/match[1]
    # use below, it presumably matched note links and captured
    # (target file, fragment id) pairs. As written, it matches nothing useful.
    matches = re.findall(f'', text)
    if matches:
        for match in matches:
            if match[0] != '' and not re.search("toc\.x?html?", match[0]):
                # Find the extracted member the link's href points into.
                for dd in files_to_work_on:
                    if re.search(f".*?{match[0]}.*?", dd):
                        with open(dd, 'r') as source:
                            source_text = source.read()
                        # NOTE(review): this re-check scans `text`, not
                        # `source_text` — it looks like it meant to skip
                        # non-chapter *source* files; verify intent.
                        test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
                        if test:
                            continue
                        # NOTE(review): pattern lost — presumably located the
                        # note-body element whose id equals match[1].
                        source_match = re.search(f"", source_text)
                        if source_match:
                            fixed_text = re.sub('', '', text) # These are stupid, we can just as well get rid of them. Sub below also catches them.
                            # NOTE(review): pattern truncated (literal starts
                            # with ')') and the replacement is empty —
                            # presumably rewrote the cross-file href into a
                            # local '#fragment' link.
                            fixed_text = re.sub(f').*?href="{match[0]}#{match[1]}".*?>', fr'', fixed_text)
                            # NOTE(review): the replacement literal below is
                            # broken — an rf'...' string cannot span physical
                            # lines, and its markup was stripped; presumably it
                            # wrapped the captured note text \1 in footnote
                            # markup.
                            source_match_fixed = re.sub('[\.\s ]*(.*?)<\/p>', rf'

\1

', source_match.group())
                            # Inject the inlined note back into the chapter.
                            # NOTE(review): anchor tag stripped from the
                            # pattern here as well (likely the closing body
                            # tag).
                            text = re.sub(f"\n\s*", f"\n{source_match_fixed}\n", fixed_text)
    with open(f"work/{file}", 'w') as output:
        output.write(text)
    files_to_zip.append(file)

# Repack: copy the original epub, then append the edited members so they
# overwrite (duplicate) the originals inside the archive.
shutil.copy(file_name, "output.epub")
with ZipFile("output.epub", 'a') as zip:
    with NoStdStreams(): # ZipFile.write will throw a warning about duplicate files, we don't care, we just want it to overwrite those already in the epub.
        for file in files_to_zip:
            zip.write(f"work/{file}", file)
shutil.rmtree("work")
shutil.rmtree(f"{files_to_work_on[2].split('/')[0]}")