# epub "cooking" script: extracts (x)html chapters from an epub, rewrites
# cross-file footnote links into inline epub:type="noteref"/"footnote" pairs,
# and repacks the result as output.epub.
from zipfile import ZipFile
|
||
from pathlib import Path
|
||
import os, sys, re, shutil, argparse
|
||
|
||
# Command-line interface: the script takes a single epub file to process.
parser = argparse.ArgumentParser("epub cooking")
parser.add_argument("file", type=str, help="The file to be worked on.")
args = parser.parse_args()

# Path of the epub being processed.
file_name = args.file
# (x)html files extracted from the epub that may need footnote rewriting.
files_to_work_on = []
# Files actually rewritten, to be re-inserted into the output epub.
files_to_zip = []
# Shared with progressbar(): width of the last item label that was printed.
lastItemLength = 1
||
def progressbar(it, prefix="", size=60, out=sys.stdout): # Python3.6+
    """Yield items from *it* while rendering a one-line textual progress bar.

    Args:
        it: a sized iterable (len() is taken once up front).
        prefix: text printed before the bar.
        size: bar width in characters.
        out: stream the bar is written to.

    Yields:
        The items of *it*, unchanged.
    """
    count = len(it)
    # Width of the previously printed item label, used to blank any leftover
    # characters when a shorter label overwrites a longer one. Kept as a
    # closure variable instead of a module global so repeated calls (or
    # concurrent bars) don't share stale state.
    last_item_length = 1

    def show(j, item):
        nonlocal last_item_length
        x = int(size * j / count)
        # Clamp to zero: a longer label needs no padding.
        spaces_to_print = max(last_item_length - len(item), 0)
        print(f"{prefix}[{'█'*x}{('.'*(size-x))}] {j}/{count} (Currently working on: {item}){' '*spaces_to_print}", end='\r', file=out, flush=True)
        last_item_length = len(item)

    for i, item in enumerate(it):
        show(i + 1, item)
        yield item
    # Final line: full bar, then blank out the last item label.
    print(f"{prefix}[{'█'*size}] {count}/{count} {' '*last_item_length}", end='\r', file=out, flush=True)
    print("\n", flush=True, file=out)
||
class NoStdStreams(object):
    """Context manager that silences (or redirects) stdout and stderr.

    Args:
        stdout: replacement stream for sys.stdout; defaults to os.devnull.
        stderr: replacement stream for sys.stderr; defaults to os.devnull.
    """

    def __init__(self, stdout=None, stderr=None):
        # Remember what the caller asked for; the devnull file is opened
        # lazily in __enter__ so the manager can be re-used after __exit__
        # (the original opened it here and closed it in __exit__, breaking
        # any second `with` on the same instance).
        self._requested_stdout = stdout
        self._requested_stderr = stderr
        self.devnull = None

    def __enter__(self):
        self.devnull = open(os.devnull, 'w')
        self._stdout = self._requested_stdout or self.devnull
        self._stderr = self._requested_stderr or self.devnull
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        self.old_stdout.flush(); self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush(); self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.devnull.close()
||
# Extract every content (x)html file from the epub, skipping the title page.
# Raw strings avoid invalid-escape warnings; `archive` avoids shadowing the
# builtin `zip`.
with ZipFile(file_name, 'r') as archive:
    for member in archive.namelist():
        if re.search(r'\.x?html?$', member) and not re.search(r'titlepage\.x?html?$', member):
            archive.extract(member)
            files_to_work_on.append(member)
||
# Mirror the epub's top-level content directory inside the scratch "work" dir.
Path("work").mkdir(parents=True, exist_ok=True)
# Use the first extracted file rather than an arbitrary later index so epubs
# with fewer than three content files don't raise IndexError; guard against
# an epub with no content files at all. Assumes all content files share one
# top-level directory, as is typical for epubs.
if files_to_work_on:
    Path(f"work/{files_to_work_on[0].split('/')[0]}").mkdir(parents=True, exist_ok=True)

print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it")
||
# Markers identifying non-chapter files (toc, frontmatter, backmatter,
# appendix). Compiled once instead of rebuilt for every file and every match.
non_chapter_re = re.compile(
    r'id="toc"'
    r'|epub:type="[\w\s]*?toc[\w\s]*?"'
    r'|epub:type="[\w\s]*?frontmatter[\w\s]*?"'
    r'|epub:type="[\w\s]*?backmatter[\w\s]*?"'
    r'|epub:type="[\w\s]*?appendix[\w\s]*?"'
)

for file in progressbar(files_to_work_on, "", 40):
    with open(file, 'r') as epub_file:
        text = epub_file.read()
    # Only rewrite chapters; skip toc/frontmatter/backmatter/appendix files.
    if non_chapter_re.findall(text):
        print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
        continue
    # Every internal link of the form href="target#fragment".
    matches = re.findall(r'<a[\w\d\s="-]*?href="(.*?)#(.*?)"[\w\d\s="-]*?>', text)
    if matches:
        for match in matches:
            # Only follow links that point into another file (not toc links).
            if match[0] != '' and not re.search(r"toc\.x?html?", match[0]):
                for dd in files_to_work_on:
                    # NOTE(review): match[0] is interpolated into the pattern
                    # unescaped — regex metacharacters in a filename would
                    # misbehave; confirm epub filenames are plain.
                    if re.search(f".*?{match[0]}.*?", dd):
                        with open(dd, 'r') as source:
                            source_text = source.read()
                            # BUGFIX: classify the *linked* file, not `text`
                            # (the current chapter, already known clean) —
                            # the original re-tested `text`, so this guard
                            # never fired.
                            if non_chapter_re.findall(source_text):
                                continue
                            # Paragraph in the linked file holding the footnote body.
                            source_match = re.search(f"<p.*?id=\"{match[1]}\".*?</p>", source_text)
                            if source_match:
                                fixed_text = re.sub(r'<a id=".*?"(?:><a)?\/>', '', text) # These are stupid, we can just as well get rid of them. Sub below also catches them.
                                # Rewrite the cross-file link into an in-file noteref.
                                fixed_text = re.sub(f'<a(?! epub:type="noteref")(?!.*?\\/>).*?href="{match[0]}#{match[1]}".*?>', fr'<a epub:type="noteref" href="#{match[1]}">', fixed_text)
                                # Re-tag the source paragraph as an inline footnote.
                                source_match_fixed = re.sub(r'<p.*?><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p epub:type="footnote" id="{match[1]}">\1</p>', source_match.group())
                                # Append the footnote just before </body>.
                                text = re.sub(r"\n\s*</body>", f"\n{source_match_fixed}\n</body>", fixed_text)
    with open(f"work/{file}", 'w') as output:
        output.write(text)
    files_to_zip.append(file)
||
# Start from a copy of the original epub and overwrite the edited files in it.
shutil.copy(file_name, "output.epub")

with ZipFile("output.epub", 'a') as archive:
    with NoStdStreams(): # ZipFile.write will throw a warning about duplicate files, we don't care, we just want it to overwrite those already in the epub.
        for file in files_to_zip:
            archive.write(f"work/{file}", file)

# Remove the scratch dir and the content dir extracted next to the script.
# Index 0 (not 2) so epubs with fewer than three content files don't raise
# IndexError; guard against no content files at all.
shutil.rmtree("work")
if files_to_work_on:
    shutil.rmtree(files_to_work_on[0].split('/')[0])