Updates
This commit is contained in:
parent
b02a6849f7
commit
a60f012085
15
cook.py
15
cook.py
|
@ -47,19 +47,20 @@ class NoStdStreams(object):
|
||||||
|
|
||||||
with ZipFile(file_name, 'r') as zip:
|
with ZipFile(file_name, 'r') as zip:
|
||||||
for filename in zip.namelist():
|
for filename in zip.namelist():
|
||||||
if re.search('.*?(?!titlepage).*?\.x?html?$', filename):
|
if re.search('\.x?html?$', filename):
|
||||||
|
if not re.search('titlepage\.x?html?$', filename):
|
||||||
zip.extract(filename)
|
zip.extract(filename)
|
||||||
files_to_work_on.append(filename)
|
files_to_work_on.append(filename)
|
||||||
|
|
||||||
Path("work").mkdir(parents=True, exist_ok=True)
|
Path("work").mkdir(parents=True, exist_ok=True)
|
||||||
Path(f"work/{files_to_work_on[2].split('/')[0]}").mkdir(parents=True, exist_ok=True)
|
Path(f"work/{files_to_work_on[2].split('/')[0]}").mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it")
|
print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it")
|
||||||
|
|
||||||
for file in progressbar(files_to_work_on, "", 40):
|
for file in progressbar(files_to_work_on, "", 40):
|
||||||
with open(file, 'r') as epub_file:
|
with open(file, 'r') as epub_file:
|
||||||
text = epub_file.read()
|
text = epub_file.read()
|
||||||
test = re.findall('id="toc"|epub:type=".*?toc.*?"|epub:type=".*?frontmatter.*?"|epub:type=".*?backmatter.*?"|epub:type=".*?appendix.*?"', text)
|
test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
|
||||||
if test:
|
if test:
|
||||||
print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
|
print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
|
||||||
continue
|
continue
|
||||||
|
@ -70,13 +71,15 @@ for file in progressbar(files_to_work_on, "", 40):
|
||||||
for dd in files_to_work_on:
|
for dd in files_to_work_on:
|
||||||
if re.search(f".*?{match[0]}.*?", dd):
|
if re.search(f".*?{match[0]}.*?", dd):
|
||||||
with open(dd, 'r') as source:
|
with open(dd, 'r') as source:
|
||||||
test = re.findall('id="toc"|epub:type=".*?toc.*?"|epub:type=".*?frontmatter.*?"|epub:type=".*?backmatter.*?"|epub:type=".*?appendix.*?"', text)
|
source_text = source.read()
|
||||||
|
test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
|
||||||
if test:
|
if test:
|
||||||
continue
|
continue
|
||||||
source_match = re.search(f"<p.*?id=\"{match[1]}\".*?</p>", source.read())
|
source_match = re.search(f"<p.*?id=\"{match[1]}\".*?</p>", source_text)
|
||||||
if source_match:
|
if source_match:
|
||||||
|
fixed_text = re.sub('<a id=".*?"(?:><a)?\/>', '', text) # These are stupid, we can just as well get rid of them. Sub below also catches them.
|
||||||
|
fixed_text = re.sub(f'<a(?! epub:type="noteref")(?!.*?\/>).*?href="{match[0]}#{match[1]}".*?>', fr'<a epub:type="noteref" href="#{match[1]}">', fixed_text)
|
||||||
source_match_fixed = re.sub('<p.*?><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p epub:type="footnote" id="{match[1]}">\1</p>', source_match.group())
|
source_match_fixed = re.sub('<p.*?><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p epub:type="footnote" id="{match[1]}">\1</p>', source_match.group())
|
||||||
fixed_text = re.sub(f'<a(?! epub:type="noteref").*?href="{match[0]}#{match[1]}".*?>', fr'<a epub:type="noteref" href="#{match[1]}">', text)
|
|
||||||
text = re.sub(f"\n\s*</body>", f"\n{source_match_fixed}\n</body>", fixed_text)
|
text = re.sub(f"\n\s*</body>", f"\n{source_match_fixed}\n</body>", fixed_text)
|
||||||
with open(f"work/{file}", 'w') as output:
|
with open(f"work/{file}", 'w') as output:
|
||||||
output.write(text)
|
output.write(text)
|
||||||
|
|
Loading…
Reference in a new issue