def download_pad(url: str, timeout: float = 30.0) -> str:
    """Download the plain-text export of a pad and return it as a string.

    URLs containing ``/p/`` are fetched via ``<url>.txt``; every other URL is
    fetched via ``<url>/download``.

    Args:
        url: Address of the pad, without an export suffix.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url += ".txt" if "/p/" in url else "/download"
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than handing an HTML error page to the callers.
    response.raise_for_status()
    # Decode explicitly: ``response.text`` depends on the charset announced in
    # the response headers, which is not guaranteed to be UTF-8.
    return response.content.decode("utf-8")


def get_pads_from_collection_pad(pad_url: str) -> list:
    """Parse the protocol-link pad and return a ``Pad`` for every link in it.

    Each line may hold optional prefix signs, followed by an ``https://`` URL,
    followed by an optional ``#`` comment. Empty lines and pure comment lines
    are ignored.

    Args:
        pad_url: URL of the collection pad; ``None`` or ``""`` yields no pads.

    Returns:
        A list of ``Pad`` objects in the order the links appear in the pad.
    """
    pads = []
    if not pad_url:
        return pads
    for line in download_pad(pad_url).split("\n"):
        entry = line.split("#")[0].strip()
        if not entry:
            continue
        prefixes, scheme, rest = entry.partition("https://")
        if not scheme:
            # A line without a link would otherwise abort the whole run with
            # an unpacking error; report it and carry on with the next line.
            print(f"Skipping line without https:// link: {line!r}")
            continue
        pads.append(Pad.from_url(scheme + rest, prefixes=prefixes))
    return pads
datetime.datetime.strptime(dateStr, "%Y%m%d") + return datetime.datetime.strptime(date_str, "%Y%m%d") - isAttachement = ATTACHEMENT_SIGN in prefixes - isFachgruppenvollversammlung = FACHGRUPPENVOLLVERSAMMLUNG_SIGN in prefixes - isNotApproved = NOT_APPROVED_SIGN in prefixes + is_attachment = ATTACHMENT_SIGN in prefixes + is_fachgruppenvollversammlung = FACHGRUPPENVOLLVERSAMMLUNG_SIGN in prefixes + is_not_approved = NOT_APPROVED_SIGN in prefixes url = url.split("?")[0] type_ = None @@ -96,18 +94,18 @@ class Pad: if "/p/" in url: # Old pad url format https://pad.finf.uni-hannover.de/p/protokoll20180508eptnqw id_ = url.split("/")[-1] - dateStr = id_[:17].lstrip("protokoll") - date = getDate(dateStr, url) + date_str = id_[:17].lstrip("protokoll") + date = get_date(date_str, url) name = f'Protokoll {DEFAULT_TYPE.lower()} {date.strftime("%Y-%m-%d")}' metadata["datum"] = date.strftime("%d.%m.%Y") else: id_ = url.split("/")[-1] - if isAttachement: + if is_attachment: try: - _, dateStr, version, *name = id_.split("_") + _, date_str, version, *name = id_.split("_") name = f"{' '.join(name).title()} {version}" - date = getDate(dateStr, url) + date = get_date(date_str, url) metadata["datum"] = date.strftime("%d.%m.%Y") metadata["version"] = version @@ -115,45 +113,45 @@ class Pad: name = id_.title() else: try: - type_, dateStr = id_.split("_")[0].split("protokoll") - date = getDate(dateStr, url) + type_, date_str = id_.split("_")[0].split("protokoll") + date = get_date(date_str, url) except: - date = getDate("", url) + date = get_date("", url) type_ = type_ or DEFAULT_TYPE metadata["datum"] = date.strftime("%d.%m.%Y") name = f'Protokoll {type_.lower()} {date.strftime("%Y-%m-%d")}' - if isFachgruppenvollversammlung: + if is_fachgruppenvollversammlung: name += " - Fachgruppenvollversammlung" - return Pad(url, name, metadata=metadata, type_=type_, approved=not isNotApproved, attachement=isAttachement, - fachgruppenvollversammlung=isFachgruppenvollversammlung) + return Pad(url, 
name, metadata=metadata, type_=type_, approved=not is_not_approved, attachment=is_attachment, + fachgruppenvollversammlung=is_fachgruppenvollversammlung) - def isAttachement(self): - return self.attachement + def is_attachment(self): + return self.attachment - def isOldPad(self): + def is_old_pad(self): return "/p/" in self.url - def getRawFilename(self): + def get_raw_filename(self): return FOLDER_RAW + self.raw - def getOutFilename(self): + def get_out_filename(self): return FOLDER_RESULT + self.outfile def download(self): """ Downloads pad and saves content in file (self.raw)""" global OVERRIDE_ACCEPTED - content = downloadPad(self.url) - os.makedirs(self.getRawFilename().rsplit("/", maxsplit=1)[0], exist_ok=True) - if not OVERRIDE_ACCEPTED and self.isOldPad() and self.inCache(): - if OVERRIDE_ACCEPTED == False: # and not None + content = download_pad(self.url) + os.makedirs(self.get_raw_filename().rsplit("/", maxsplit=1)[0], exist_ok=True) + if not OVERRIDE_ACCEPTED and self.is_old_pad() and self.in_cache(): + if OVERRIDE_ACCEPTED is False: # and not None return - with open(self.getRawFilename(), "r", encoding="utf-8") as d: + with open(self.get_raw_filename(), "r") as d: if d.read() != content: while OVERRIDE_ACCEPTED is None: - in_ = input(f"Override exisiting old pad [{str(self)}]? (yes/no/all/never)\n") + in_ = input(f"Override existing old pad [{str(self)}]? 
def convert(self):
    """Convert the downloaded Markdown file of this pad to PDF with pandoc.

    Every entry of ``self.metadata`` is handed to pandoc as ``-M key=value``;
    the LaTeX template is selected by ``self.type_``. The output directory is
    created if it does not exist yet.

    Raises:
        ValueError: If the raw file has not been downloaded yet.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status.
    """
    if not self.in_cache():
        raise ValueError("Need to download first")
    out_filename = self.get_out_filename()
    os.makedirs(out_filename.rsplit("/", maxsplit=1)[0], exist_ok=True)
    metadata_args = []
    for key, value in self.metadata.items():
        metadata_args += ["-M", f"{key}={value}"]
    # check=True turns a failed pandoc run into an exception, so the caller's
    # error reporting sees it instead of the failure passing silently.
    subprocess.run(
        [
            "pandoc",
            self.get_raw_filename(),
            *metadata_args,
            f"--template={FOLDER_TEMPLATES}pandoc.{self.type_}.tex",
            "-o",
            out_filename,
        ],
        check=True,
    )
def all_pads(pads, tops_only=False):
    """Merge all pads into one Markdown file and render it to ``data/Alle.pdf``.

    Pads are written in ascending order of ``pad.date``. The intermediate file
    ``merged_topics.md`` is removed only after pandoc succeeded; when pandoc
    fails or cannot be started it is kept so it can be inspected.

    Args:
        pads: Iterable of ``Pad`` objects.
        tops_only: If true, only lines starting with ``#`` are copied, each
            followed by a ``.`` paragraph; otherwise the whole pad content is
            copied.
    """
    merged_filename = "merged_topics.md"
    # Explicit UTF-8 so the merged file does not depend on the platform's
    # default encoding.
    with open(merged_filename, "w", encoding="utf-8") as merged:
        for pad in tqdm(sorted(pads, key=lambda p: p.date), desc="Create Topics Document"):
            assert isinstance(pad, Pad)
            content = pad.get_content()
            if tops_only:
                for line in content.split("\n"):
                    if line.strip().startswith("#"):
                        merged.write(f"{line}\n.\n\n")
            else:
                merged.write(f"{content}\n")

    os.makedirs("data", exist_ok=True)
    command = [
        "pandoc",
        merged_filename,
        f"--template={FOLDER_TEMPLATES}pandoc.topics.tex",
        "-o",
        "data/Alle.pdf",
    ]
    try:
        # Without check=True no CalledProcessError is raised; the exit status
        # is inspected explicitly below.
        result = subprocess.run(command)
    except OSError as err:
        # Raised when the pandoc executable cannot be started at all.
        print(f"Could not run pandoc: {err}")
        return
    if result.returncode == 0:
        os.remove(merged_filename)
    else:
        print(f"pandoc exited with status {result.returncode}; keeping {merged_filename}")
pads += [Pad.fromURL(url, prefixes=parsed.prefixes) for url in urls] + pads = get_pads_from_collection_pad(parsed.pad) + pads += [Pad.from_url(url, prefixes=parsed.prefixes) for url in urls] if parsed.newOnly: pads = [pad for pad in pads if not pad.approved] if parsed.ignoreOld: - pads = [pad for pad in pads if not pad.isOldPad()] - if parsed.attachementsOnly: - pads = [pad for pad in pads if pad.isAttachement()] + pads = [pad for pad in pads if not pad.is_old_pad()] + if parsed.attachmentsOnly: + pads = [pad for pad in pads if pad.is_attachment()] if parsed.download: - downloadPads(pads) + download_pads(pads) if parsed.topics: - allPads(pads, topsOnly=not parsed.allContent) + all_pads(pads, tops_only=not parsed.allContent) if parsed.compile: - compilePads(pads) + compile_pads(pads)