cook.py: Work with more books
This commit is contained in:
parent
cdeffb3e86
commit
341bc5879b
70
cook.py
70
cook.py
|
@ -1,36 +1,71 @@
|
||||||
from zipfile import ZipFile
|
from zipfile import ZipFile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys, re, shutil, subprocess
|
import os, sys, time, re, shutil, subprocess
|
||||||
|
|
||||||
file_name = sys.argv[1]
|
file_name = sys.argv[1]
|
||||||
files_to_work_on = []
|
files_to_work_on = []
|
||||||
files_to_zip = []
|
files_to_zip = []
|
||||||
files_to_dl = []
|
|
||||||
|
def progressbar(it, prefix="", size=60, out=sys.stdout):  # Python3.6+
    """Yield the items of *it* while drawing a text progress bar on *out*.

    After each item has been consumed by the caller, the bar is redrawn in
    place (the line ends with a carriage return, not a newline) as
    ``{prefix}[███...] done/total``.  Once the iterable is exhausted a blank
    line is printed so later output starts below the bar.

    Args:
        it: The iterable to walk.  If it has no length (e.g. a generator) it
            is materialised into a list first so the total is known.
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Text stream the bar is written to.

    Yields:
        Each item of *it*, in order.
    """
    try:
        count = len(it)
    except TypeError:
        # Unsized iterable: the total is needed to scale the bar.
        it = list(it)
        count = len(it)

    def show(done):
        # Only called from inside the loop, so count is never 0 here.
        filled = int(size * done / count)
        bar = "█" * filled + "." * (size - filled)
        print(f"{prefix}[{bar}] {done}/{count}", end="\r", file=out, flush=True)

    for done, item in enumerate(it, start=1):
        yield item
        show(done)
    print("\n", flush=True, file=out)
|
||||||
|
|
||||||
|
class NoStdStreams(object):
    """Context manager that temporarily replaces ``sys.stdout``/``sys.stderr``.

    Inside the ``with`` block, ``sys.stdout`` and ``sys.stderr`` point at the
    streams given to the constructor, or at ``os.devnull`` for any stream
    that was not given.  The previous streams are restored on exit.

    Args:
        stdout: Replacement for ``sys.stdout``; ``None`` means discard output.
        stderr: Replacement for ``sys.stderr``; ``None`` means discard output.
    """

    def __init__(self, stdout=None, stderr=None):
        self._requested_stdout = stdout
        self._requested_stderr = stderr
        # Opened in __enter__ so that an instance which is never entered does
        # not leak a file handle, and so the instance can be entered again.
        self.devnull = None

    def __enter__(self):
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        self.old_stdout.flush()
        self.old_stderr.flush()
        self.devnull = open(os.devnull, 'w')
        if self._requested_stdout is not None:
            self._stdout = self._requested_stdout
        else:
            self._stdout = self.devnull
        if self._requested_stderr is not None:
            self._stderr = self._requested_stderr
        else:
            self._stderr = self.devnull
        sys.stdout, sys.stderr = self._stdout, self._stderr
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self._stdout.flush()
            self._stderr.flush()
        finally:
            # Restore the real streams even if a flush above failed.
            sys.stdout = self.old_stdout
            sys.stderr = self.old_stderr
            self.devnull.close()
|
||||||
|
|
||||||
with ZipFile(file_name, 'r') as zip:
|
with ZipFile(file_name, 'r') as zip:
|
||||||
for filename in zip.namelist():
|
for filename in zip.namelist():
|
||||||
if filename.startswith("text/"):
|
if re.search('.x?html?', filename):
|
||||||
if filename != "text/":
|
|
||||||
zip.extract(filename)
|
zip.extract(filename)
|
||||||
files_to_work_on.append(filename)
|
files_to_work_on.append(filename)
|
||||||
|
|
||||||
print(files_to_work_on)
|
Path("work").mkdir(parents=True, exist_ok=True)
|
||||||
|
Path(f"work/{files_to_work_on[0].split('/')[0]}").mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
for file in files_to_work_on:
|
print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it")
|
||||||
|
|
||||||
|
for file in progressbar(files_to_work_on, "", 40):
|
||||||
with open(file, 'r') as epub_file:
|
with open(file, 'r') as epub_file:
|
||||||
text = epub_file.read()
|
text = epub_file.read()
|
||||||
matches = re.findall("<a class=\"hlink\" id=\"(.*?)\" href=\"(.*?)#(.*?)\">", text)
|
test = re.findall('id="toc"', text)
|
||||||
|
if test:
|
||||||
|
continue
|
||||||
|
matches = re.findall('<a class="hlink"(?:(?:href="(.*?)#(.*?)")*.*?)+>', text)
|
||||||
if matches:
|
if matches:
|
||||||
for match in matches:
|
for match in matches:
|
||||||
match_dir = "text/" + match[1]
|
if match[0] != '':
|
||||||
with open(match_dir, 'r') as source:
|
for dd in files_to_work_on:
|
||||||
source_match = re.search(f"<p class=\"block\" id=\"{match[2]}\">.*?</p>", source.read())
|
if re.search(f".*?{match[0]}.*?", dd):
|
||||||
source_match_fixed = re.sub('<p(.*?)><a.*?</a> (.*?)</p>', r'<p\1>\2</p>', source_match.group())
|
with open(dd, 'r') as source:
|
||||||
|
source_match = re.search(f"<p class=\"hanging1\".*?id=\"{match[1]}\">.*?</p>", source.read()) # VHS: Change hanging1
|
||||||
|
if source_match:
|
||||||
|
source_match_fixed = re.sub('<p(?:(.*?)?(?:id=".*?")?)><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p\1 id={match[1]}>\2</p>', source_match.group())
|
||||||
source_match_fixed = re.sub('<p (.*?)</p>', r'<p epub:type="footnote" \1</p>', source_match_fixed)
|
source_match_fixed = re.sub('<p (.*?)</p>', r'<p epub:type="footnote" \1</p>', source_match_fixed)
|
||||||
fixed_text = re.sub(f"<a class=\"hlink\" id=\"{match[0]}\" href=\".*?#.*?\">", f"<a epub:type=\"noteref\" class=\"hlink\" href=\"#{match[2]}\">", text)
|
fixed_text = re.sub(f"<a class=\"hlink\" ((?:id=\"{match[0]}\")*(?:href=\".*?#.*?\")*.*?)*>", f"<a epub:type=\"noteref\" class=\"hlink\" href=\"#{match[1]}\">", text)
|
||||||
text = re.sub(f"\n\s*</body></html>", f"\n{source_match_fixed}\n</body></html>", fixed_text)
|
text = re.sub(f"\n\s*</body>", f"\n{source_match_fixed}\n</body>", fixed_text)
|
||||||
if not match_dir in files_to_dl:
|
|
||||||
files_to_dl.append(match_dir)
|
|
||||||
with open(f"work/{file}", 'w') as output:
|
with open(f"work/{file}", 'w') as output:
|
||||||
output.write(text)
|
output.write(text)
|
||||||
files_to_zip.append(file)
|
files_to_zip.append(file)
|
||||||
|
@ -38,7 +73,8 @@ for file in files_to_work_on:
|
||||||
shutil.copy(file_name, "output.epub")
|
shutil.copy(file_name, "output.epub")
|
||||||
|
|
||||||
with ZipFile("output.epub", 'a') as zip:
|
with ZipFile("output.epub", 'a') as zip:
|
||||||
|
with NoStdStreams(): # ZipFile.write will throw a warning about duplicate files, we don't care, we just want it to overwrite those already in the epub.
|
||||||
for file in files_to_zip:
|
for file in files_to_zip:
|
||||||
zip.write(f"work/{file}", file)
|
zip.write(f"work/{file}", file)
|
||||||
#cmd=['zip', '-d', "output.epub"] + files_to_dl
|
|
||||||
#subprocess.check_call(cmd)
|
# shutil.rmtree("work")
|
||||||
|
|
Loading…
Reference in a new issue