# cook.py — "cooks" an epub: finds cross-file footnote links in each chapter
# and inlines the referenced paragraphs as epub:type="noteref" footnotes.
from zipfile import ZipFile
from pathlib import Path
import os, sys, re, shutil, argparse
# Command-line interface: the single positional argument is the epub to process.
parser = argparse.ArgumentParser("epub cooking")
parser.add_argument("file", help="The file to be worked on.", type=str)
args = parser.parse_args()

file_name = args.file     # path of the epub being cooked
files_to_work_on = []     # (x)html members extracted from the epub
files_to_zip = []         # rewritten files to be copied into the output epub
lastItemLength = 1        # width bookkeeping for the progress-bar redraw
def progressbar(it, prefix="", size=60, out=sys.stdout):  # Python3.6+
    """Yield items from *it* while drawing an in-place progress bar on *out*.

    it     -- a sized iterable (len() is used for the total)
    prefix -- text printed before the bar
    size   -- bar width in characters
    out    -- stream to draw on (default sys.stdout)
    """
    count = len(it)
    # The bar is redrawn in place with '\r'; remember how long the previous
    # item label was so a shorter label blanks out the leftover characters.
    last_item_length = 1

    def show(j, item):
        nonlocal last_item_length
        filled = int(size * j / count)
        pad = max(0, last_item_length - len(item))  # never negative padding
        # BUGFIX: the filled-bar glyph was lost in transit (u''*x drew
        # nothing); restore the solid block so progress is visible.
        print(f"{prefix}[{u'█'*filled}{('.'*(size-filled))}] {j}/{count} (Currently working on: {item}){' '*pad}",
              end='\r', file=out, flush=True)
        last_item_length = len(item)

    for i, item in enumerate(it):
        show(i + 1, item)
        yield item
    # Final full bar; pad to erase the tail of the last status line.
    print(f"{prefix}[{u'█'*size}] {count}/{count} {' '*last_item_length}", end='\r', file=out, flush=True)
    print("\n", flush=True, file=out)
class NoStdStreams(object):
    """Context manager that temporarily redirects sys.stdout / sys.stderr.

    By default both streams are sent to os.devnull (i.e. silenced); a
    replacement stream for either can be supplied instead.

    BUGFIX: the devnull handle used to be opened in __init__ and closed in
    __exit__, so reusing the same instance wrote to a closed file.  It is
    now opened per __enter__, making the manager safely reusable.
    """

    def __init__(self, stdout=None, stderr=None):
        self._stdout_target = stdout
        self._stderr_target = stderr
        self.devnull = None

    def __enter__(self):
        self.devnull = open(os.devnull, 'w')
        self._stdout = self._stdout_target or self.devnull
        self._stderr = self._stderr_target or self.devnull
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Flush the real streams before swapping so no buffered output is lost.
        self.old_stdout.flush(); self.old_stderr.flush()
        sys.stdout, sys.stderr = self._stdout, self._stderr
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._stdout.flush(); self._stderr.flush()
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr
        self.devnull.close()
# Pull every (x)html chapter file out of the epub into the current directory;
# the titlepage is deliberately left alone.
with ZipFile(file_name, 'r') as archive:  # renamed: 'zip' shadowed the builtin
    for filename in archive.namelist():
        if re.search(r'\.x?html?$', filename):
            if not re.search(r'titlepage\.x?html?$', filename):
                archive.extract(filename)
                files_to_work_on.append(filename)

# Mirror the extracted directory layout under work/ so rewritten copies can be
# written beside (not over) the originals.  Creating the parent directory of
# every file replaces the old hard-coded files_to_work_on[2] index, which
# raised IndexError for epubs with fewer than three chapter files and assumed
# a single shared top-level directory.
Path("work").mkdir(parents=True, exist_ok=True)
for extracted in files_to_work_on:
    Path(f"work/{extracted}").parent.mkdir(parents=True, exist_ok=True)

print(f"Cooking on {file_name}, with {len(files_to_work_on)} (x)html files in it")
# Rewrite cross-file footnote links into inline footnotes: for every chapter,
# find links of the form href="file#id", look up the target paragraph in the
# referenced file, retarget the link to epub:type="noteref" href="#id", and
# copy the paragraph into the chapter just before </body>.
for file in progressbar(files_to_work_on, "", 40):
    with open(file, 'r', encoding='utf-8') as epub_file:  # epub content is UTF-8
        text = epub_file.read()
    # Front/back matter, appendices and the table of contents are not
    # chapters; leave them untouched.
    test = re.findall(r'id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', text)
    if test:
        print(f"Skipping {file}, it seems to not be a chapter{' '*42}")
        continue
    matches = re.findall(r'<a[\w\d\s="-]*?href="(.*?)#(.*?)"[\w\d\s="-]*?>', text)
    if matches:
        for match in matches:
            # Only follow links that point into another file (not "#id"
            # self-references) and that are not ToC back-links.
            if match[0] != '' and not re.search(r"toc\.x?html?", match[0]):
                for dd in files_to_work_on:
                    # Plain substring test: the old f-string regex let '.' in
                    # the filename match any character.
                    if match[0] in dd:
                        with open(dd, 'r', encoding='utf-8') as source:
                            source_text = source.read()
                        # BUGFIX: check the *source* file for non-chapter
                        # markers — the old code re-checked `text`, which had
                        # already passed the same test above, so toc/front
                        # matter sources were never skipped.
                        test = re.findall(r'id="toc"|epub:type="[\w\s]*?toc[\w\s]*?"|epub:type="[\w\s]*?frontmatter[\w\s]*?"|epub:type="[\w\s]*?backmatter[\w\s]*?"|epub:type="[\w\s]*?appendix[\w\s]*?"', source_text)
                        if test:
                            continue
                        # The paragraph the link points at (ids escaped so
                        # regex metacharacters in them are taken literally).
                        source_match = re.search(f'<p.*?id="{re.escape(match[1])}".*?</p>', source_text)
                        if source_match:
                            # Self-closing anchors are useless; drop them so the
                            # substitution below cannot trip over them.
                            fixed_text = re.sub(r'<a id=".*?"(?:><a)?/>', '', text)
                            # Retarget the link to the local footnote copy.
                            fixed_text = re.sub(f'<a(?! epub:type="noteref")(?!.*?/>).*?href="{re.escape(match[0])}#{re.escape(match[1])}".*?>', fr'<a epub:type="noteref" href="#{match[1]}">', fixed_text)
                            # Strip the back-link anchor out of the footnote body
                            # and tag it as an epub footnote.
                            source_match_fixed = re.sub(r'<p.*?><a.*?<\/a>[\.\s ]*(.*?)<\/p>', rf'<p epub:type="footnote" id="{match[1]}">\1</p>', source_match.group())
                            # Append the footnote just before </body>.  A lambda
                            # replacement keeps backslashes / \1 sequences in the
                            # footnote text from being interpreted by re.sub.
                            text = re.sub(r"\n\s*</body>", lambda m: f"\n{source_match_fixed}\n</body>", fixed_text)
    with open(f"work/{file}", 'w', encoding='utf-8') as output:
        output.write(text)
    files_to_zip.append(file)
# Start from a byte-for-byte copy of the input epub, then overwrite the
# rewritten chapters inside it.
shutil.copy(file_name, "output.epub")
with ZipFile("output.epub", 'a') as archive:  # renamed: 'zip' shadowed the builtin
    # ZipFile.write warns about duplicate names; shadowing the old member is
    # exactly what we want, so silence the warning spam.
    with NoStdStreams():
        for file in files_to_zip:
            archive.write(f"work/{file}", file)

# Clean up the scratch tree and everything we extracted from the epub.
# Removing each distinct top-level entry replaces the old hard-coded
# files_to_work_on[2] index (IndexError on small epubs, single-dir assumption).
shutil.rmtree("work")
for top_level in {f.split('/')[0] for f in files_to_work_on}:
    if os.path.isdir(top_level):
        shutil.rmtree(top_level)
    elif os.path.exists(top_level):
        os.remove(top_level)  # chapter extracted at the archive root