From a60f01208523521721ff90d5980d0f777679f158 Mon Sep 17 00:00:00 2001 From: HackerNCoder Date: Sun, 3 Mar 2024 01:38:31 +0100 Subject: [PATCH] Updates --- cook.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cook.py b/cook.py index ae4b50e..f94ec94 100644 --- a/cook.py +++ b/cook.py @@ -47,19 +47,20 @@ class NoStdStreams(object): with ZipFile(file_name, 'r') as zip: for filename in zip.namelist(): - if re.search('.*?(?!titlepage).*?\.x?html?$', filename): - zip.extract(filename) - files_to_work_on.append(filename) + if re.search('\.x?html?$', filename): + if not re.search('titlepage\.x?html?$', filename): + zip.extract(filename) + files_to_work_on.append(filename) Path("work").mkdir(parents=True, exist_ok=True) Path(f"work/{files_to_work_on[2].split('/')[0]}").mkdir(parents=True, exist_ok=True) -print(f"Cooking on {file_name}, with {len(files_to_work_on)} files in it") +print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it") for file in progressbar(files_to_work_on, "", 40): with open(file, 'r') as epub_file: text = epub_file.read() - test = re.findall('id="toc"|epub:type=".*?toc.*?"|epub:type=".*?frontmatter.*?"|epub:type=".*?backmatter.*?"|epub:type=".*?appendix.*?"', text) + test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text) if test: print(f"Skipping {file}, it seems to not be a chapter{' '*42}") continue @@ -70,13 +71,15 @@ for file in progressbar(files_to_work_on, "", 40): for dd in files_to_work_on: if re.search(f".*?{match[0]}.*?", dd): with open(dd, 'r') as source: - test = re.findall('id="toc"|epub:type=".*?toc.*?"|epub:type=".*?frontmatter.*?"|epub:type=".*?backmatter.*?"|epub:type=".*?appendix.*?"', text) + source_text = source.read() + test = re.findall('id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text) if test: continue - source_match = re.search(f"", source.read()) + source_match = re.search(f"", source_text) if source_match: + fixed_text = re.sub('', '', text) # These are stupid, we can just as well get rid of them. Sub below also catches them. + fixed_text = re.sub(f').*?href="{match[0]}#{match[1]}".*?>', fr'', fixed_text) source_match_fixed = re.sub('[\.\s ]*(.*?)<\/p>', rf'

\1

', source_match.group()) - fixed_text = re.sub(f'', fr'
', text) text = re.sub(f"\n\s*", f"\n{source_match_fixed}\n", fixed_text) with open(f"work/{file}", 'w') as output: output.write(text)