Commit 5b17f3ea authored by Dominik Woiwode's avatar Dominik Woiwode
Browse files

Add script to download/compile protocols

parent 96d94b5f
.idea/
data/
\ No newline at end of file
# Protokoll Texer
## linkToPDF.py
### Usage
#### Basic usage:
```
python3 linkToPDF.py <url to pad> --download --compile
```
#### Usage with link-pad:
```
python3 linkToPDF.py --pad <url to pad>
```
#### Advanced usage:
Mark the pad as a "Fachschaftenversammlung" (prefix `>`):
```
python3 linkToPDF.py <url to pad> --prefixes=">"
```
Get all not-yet-approved pads from the link pad, then download and compile them:
```
python3 linkToPDF.py --pad <url to pad> --newOnly --download --compile
```
Create one merged document with the topics of all protocols:
```
python3 linkToPDF.py --pad <url to pad> --topics
```
### Link-Pad
The link pad must have the following format:
```
# Der Markdownsyntax wird hier nicht beachtet. "#" dient als Zeilenkommentaranfang.
# Legende:
# * : Muss noch beschlossen werden
# > : Fachschaftenversammlung
# + : Anlage
+https://pad.finf.uni-hannover.de/finfanlage_20190122_v1_finanzkonzept # Dieses Pad ist eine Anlage
*https://pad.finf.uni-hannover.de/finfprotokollYYYYMMDD_SALT # Protokoll noch nicht genehmigt
https://pad.finf.uni-hannover.de/finfprotokollYYYYMMDD_SALT # Protokoll bereits genehmigt
>*https://pad.finf.uni-hannover.de/finfprotokollYYYYMMDD_SALT # Protokoll nicht genehmigt; Ist Fachschaftenversammlung
```
\ No newline at end of file
import datetime
import os
import subprocess

import requests
from tqdm import tqdm
# Collection pad listing the protocol pads (not referenced by the code visible in this file).
PROTOCOL_PAD_URL = "https://pad.finf.uni-hannover.de/protokolllinks_terces"

# Prefix characters that may precede a URL on a link-pad line.
NOT_APPROVED_SIGN = "*"
ATTACHEMENT_SIGN = "+"
FACHSCHAFTENVERSAMMLUNG_SIGN = ">"

# Protocol type used whenever none can be derived from the pad URL.
DEFAULT_TYPE = "finf"

# Answer to "overwrite a cached old pad?": None = ask, True = always, False = never.
OVERRIDE_ACCEPTED = None

# Target folders for downloaded markdown and generated PDFs.
FOLDER_RAW = "data/raw/"
FOLDER_RESULT = "data/result/"

# Hard coded URL -> date mapping for pads whose URL carries no date or a wrong one.
# Should be parsed from the pad itself in later versions.
URL_DATE_MAPPING = {
    "https://pad.finf.uni-hannover.de/finfprotokoll20190903_kvakva":
        datetime.datetime(year=2019, month=9, day=4),
    "https://pad.finf.uni-hannover.de/finfprotokoll20181129_evkftw":
        datetime.datetime(year=2018, month=11, day=27),
    "https://pad.finf.uni-hannover.de/ZjI1MmP_RVOkfuapfGuo1Q":
        datetime.datetime(year=2018, month=5, day=29),
}
def downloadPad(url: str) -> str:
    """
    Download the plain-text content of a pad.

    URLs containing "/p/" (old pad format) are exported by appending ".txt",
    all other URLs by appending "/download".

    :param url: pad URL without any export suffix
    :return: response body decoded as UTF-8
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.RequestException: on connection problems or timeout
    """
    suffix = ".txt" if "/p/" in url else "/download"
    # Without a timeout a stalled server would block the whole run forever.
    response = requests.get(url + suffix, timeout=30)
    # Fail loudly instead of returning an HTTP error page as if it were pad content.
    response.raise_for_status()
    return response.content.decode("utf-8")
def getPadsFromCollectionPad(padURL: str) -> list:
    """
    Parse the protocol link pad and build a Pad for every listed URL.

    Each line has the form ``<prefixes>https://<pad url> # comment``. Everything
    after the first "#" is ignored, as are lines that are empty before the comment.
    Lines that do not contain exactly one "https://" are reported and skipped.

    :param padURL: URL of the link pad; None yields an empty list
    :return: list of Pad objects in the order they appear in the link pad
    """
    pads = []
    if padURL is None:
        return pads
    for line in downloadPad(padURL).split("\n"):
        entry = line.split("#", 1)[0].strip()
        if not entry:
            continue
        if entry.count("https://") != 1:
            # Unpacking the split result used to raise ValueError here and abort the run.
            print(f"Skipping malformed link-pad line: {line!r}")
            continue
        prefixes, _, rest = entry.partition("https://")
        pads.append(Pad.fromURL("https://" + rest, prefixes=prefixes))
    return pads
class Pad:
    """
    A protocol or attachment pad together with the local files derived from it.

    Instances are usually created through fromURL(), which derives name, type
    and date from the pad URL and the link-pad prefixes.
    """

    def __init__(self, url: str, name, metadata=None, type_=None, approved=False, attachement=False,
                 fachschaftsvollversammlung=False):
        """
        :param url: full pad URL
        :param name: display name; also the base name of the raw and result file
        :param metadata: dict passed to pandoc via -M; a "datum" entry (DD.MM.YYYY) sets self.date
        :param type_: selects the template pandoc.<type_>.tex; defaults to "attachement"
                      for attachments and to DEFAULT_TYPE otherwise
        :param approved: whether the protocol is already approved
        :param attachement: whether the pad is an attachment (stored below "anlagen/")
        :param fachschaftsvollversammlung: whether the pad was marked with the ">" prefix
        """
        self.url = url
        # Last path segment of the URL without its query string.
        self.id_ = url.split("/")[-1].split("?")[0]
        self.name = name
        self.approved = approved
        self.attachement = attachement
        self.fachschaftsvollversammlung = fachschaftsvollversammlung
        self.metadata = metadata or dict()
        self.type_ = type_ or ("attachement" if self.isAttachement() else DEFAULT_TYPE)

        # fromURL() cannot always determine a date (attachment ids that do not follow
        # the naming scheme), so "datum" may be missing. Fall back to the smallest
        # datetime instead of crashing; such pads then sort first.
        datum = self.metadata.get("datum")
        if datum:
            self.date = datetime.datetime.strptime(datum, "%d.%m.%Y")
        else:
            self.date = datetime.datetime.min

        subfolder = "anlagen/" if self.attachement else ""
        self.raw = f"{subfolder}{self.name}.md"
        self.outfile = f"{subfolder}{self.name}.pdf"

    def __str__(self):
        return f"{self.name} ({self.url})"

    @staticmethod
    def fromURL(url, prefixes=""):
        """
        Parse protocol metadata from a pad URL.

        :param url: pad URL; a query string is dropped
        :param prefixes: link-pad prefix characters (NOT_APPROVED_SIGN, ATTACHEMENT_SIGN,
                         FACHSCHAFTENVERSAMMLUNG_SIGN) that apply to this pad
        :return: the Pad described by the URL and the prefixes
        :raises ValueError: if a protocol URL carries no parsable date and is not
                            listed in URL_DATE_MAPPING
        """

        def getDate(dateStr, link):
            """ Helper: URL_DATE_MAPPING takes precedence over the date inside the URL """
            if link in URL_DATE_MAPPING:
                return URL_DATE_MAPPING[link]
            return datetime.datetime.strptime(dateStr, "%Y%m%d")

        isAttachement = ATTACHEMENT_SIGN in prefixes
        isFachschaftsvollversammlung = FACHSCHAFTENVERSAMMLUNG_SIGN in prefixes
        isNotApproved = NOT_APPROVED_SIGN in prefixes

        url = url.split("?")[0]
        id_ = url.split("/")[-1]
        type_ = None
        metadata = dict()

        if "/p/" in url:
            # Old pad url format https://pad.finf.uni-hannover.de/p/protokoll20180508eptnqw
            dateStr = id_[:17]
            # Remove the literal prefix; str.lstrip() would strip a *set of characters*.
            if dateStr.startswith("protokoll"):
                dateStr = dateStr[len("protokoll"):]
            date = getDate(dateStr, url)
            name = f'Protokoll {DEFAULT_TYPE.lower()} {date.strftime("%Y-%m-%d")}'
            metadata["datum"] = date.strftime("%d.%m.%Y")
        elif isAttachement:
            # Expected id: <anything>_<YYYYMMDD>_<version>_<name parts...>
            try:
                _, dateStr, version, *nameParts = id_.split("_")
                date = getDate(dateStr, url)
            except ValueError:
                # Id does not follow the scheme: keep it as name, without date/version.
                name = id_.title()
            else:
                name = f"{' '.join(nameParts).title()} {version}"
                metadata["datum"] = date.strftime("%d.%m.%Y")
                metadata["version"] = version
        else:
            # Expected id: <type>protokoll<YYYYMMDD>_<salt>
            try:
                type_, dateStr = id_.split("_")[0].split("protokoll")
                date = getDate(dateStr, url)
            except ValueError:
                # Only URL_DATE_MAPPING can still provide a date; otherwise this raises.
                date = getDate("", url)
            type_ = type_ or DEFAULT_TYPE
            metadata["datum"] = date.strftime("%d.%m.%Y")
            name = f'Protokoll {type_.lower()} {date.strftime("%Y-%m-%d")}'
            # NOTE(review): the suffix is applied to regular protocols only; the original
            # indentation of this statement was lost - confirm the intended scope.
            if isFachschaftsvollversammlung:
                name += " - Fachschaftsvollversammlung"

        return Pad(url, name, metadata=metadata, type_=type_, approved=not isNotApproved,
                   attachement=isAttachement, fachschaftsvollversammlung=isFachschaftsvollversammlung)

    def isAttachement(self):
        """ Whether this pad is an attachment """
        return self.attachement

    def isOldPad(self):
        """ Whether the URL uses the old "/p/" pad format """
        return "/p/" in self.url

    def getRawFilename(self):
        """ Path of the downloaded markdown file below FOLDER_RAW """
        return FOLDER_RAW + self.raw

    def getOutFilename(self):
        """ Path of the generated PDF below FOLDER_RESULT """
        return FOLDER_RESULT + self.outfile

    def download(self):
        """
        Downloads pad and saves content in file (self.raw).

        An already cached old pad is only overwritten if OVERRIDE_ACCEPTED allows it;
        while OVERRIDE_ACCEPTED is None the user is asked whenever the content differs.
        """
        global OVERRIDE_ACCEPTED
        protectCached = not OVERRIDE_ACCEPTED and self.isOldPad() and self.inCache()
        if protectCached and OVERRIDE_ACCEPTED is False:
            # "never" was chosen earlier: keep the cached copy, no download needed.
            return

        content = downloadPad(self.url)
        os.makedirs(self.getRawFilename().rsplit("/", maxsplit=1)[0], exist_ok=True)

        if protectCached:
            with open(self.getRawFilename(), "r", encoding="utf-8") as d:
                changed = d.read() != content
            if changed:
                while OVERRIDE_ACCEPTED is None:
                    answer = input(f"Override exisiting old pad [{str(self)}]? (yes/no/all/never)\n")
                    if answer in ("y", "yes"):
                        break
                    if answer in ("n", "no"):
                        return
                    if answer == "never":
                        OVERRIDE_ACCEPTED = False
                        # Must return here, otherwise the file below is overwritten anyway.
                        return
                    if answer == "all":
                        OVERRIDE_ACCEPTED = True

        with open(self.getRawFilename(), "w", encoding="utf-8") as d:
            d.write(content)

    def inCache(self):
        """ Checks whether file already downloaded """
        return os.path.exists(self.getRawFilename())

    def getContent(self):
        """ Returns the pad content, downloading it first if it is not cached """
        if not self.inCache():
            self.download()
        # download() writes UTF-8, so read with the same encoding on every platform.
        with open(self.getRawFilename(), "r", encoding="utf-8") as d:
            return d.read()

    def convert(self):
        """
        Runs pandoc to convert the downloaded markdown file to pdf.

        The exit status of pandoc is not evaluated.

        :raises ValueError: if the pad has not been downloaded yet
        :raises FileNotFoundError: if the pandoc executable cannot be found
        """
        if not self.inCache():
            raise ValueError("Need to download first")
        os.makedirs(self.getOutFilename().rsplit("/", maxsplit=1)[0], exist_ok=True)
        # Pass an argument list instead of a shell string: file names and metadata are
        # derived from pad URLs and must not be interpreted by a shell.
        cmd = ["pandoc", self.getRawFilename()]
        for key, val in self.metadata.items():
            cmd += ["-M", f"{key}={val}"]
        cmd += [f"--template=pandoc.{self.type_}.tex", "-o", self.getOutFilename()]
        subprocess.run(cmd)
def downloadPads(pads):
    """
    Download every pad in ``pads`` while showing a progress bar.

    A failing pad is reported on stdout and does not stop the remaining downloads.
    """
    progress = tqdm(pads, desc="Downloading")
    for current in progress:
        try:
            current.download()
        except Exception as error:
            print(f"Error while downloading {current}:\n{str(error)}")
def compilePads(pads):
    """
    Convert every pad in ``pads`` to PDF while showing a progress bar.

    A failing conversion is reported on stdout and does not stop the remaining ones.
    """
    progress = tqdm(pads, desc="Converting ")
    for current in progress:
        assert isinstance(current, Pad)
        try:
            current.convert()
        except Exception as error:
            print(f"Error while converting \"{current}\":\n{str(error)}")
def allPads(pads, topsOnly=False):
    """
    Merge all pads, ordered by date, into one markdown file and render it to data/Alle.pdf.

    :param pads: pads to merge; content that is not cached yet is downloaded
    :param topsOnly: if True only lines starting with "#" (headings) are kept, each
                     followed by a line containing a single "."; otherwise the complete
                     pad content is written
    """
    mergedFilename = "merged_topics.md"
    # Pad content is UTF-8; do not depend on the platform default encoding.
    with open(mergedFilename, "w", encoding="utf-8") as d:
        for pad in tqdm(sorted(pads, key=lambda p: p.date), desc="Create Topics Document"):
            assert isinstance(pad, Pad)
            content = pad.getContent()
            if topsOnly:
                d.writelines(f"{line}\n.\n\n"
                             for line in content.split("\n")
                             if line.strip().startswith("#"))
            else:
                d.write(f"{content}\n")

    # pandoc does not create the output folder itself.
    os.makedirs("data", exist_ok=True)
    cmd = f'pandoc "{mergedFilename}" --template="pandoc.topics.tex" -o "data/Alle.pdf"'
    retCode = os.system(cmd)
    # Keep the merged markdown for inspection when pandoc failed.
    if retCode == 0:
        os.remove(mergedFilename)
if __name__ == '__main__':
    # Command line entry point: collect pads from the link pad and/or explicit URLs,
    # filter them, then download / merge / compile as requested.
    import argparse

    parser = argparse.ArgumentParser(description="Download and convert finf/fsr-protocols using pandoc")
    parser.add_argument("urls", nargs="*", type=str)
    parser.add_argument("--pad", type=str, default=None)
    parser.add_argument("--topics", action="store_true")
    parser.add_argument("--allContent", action="store_true")
    parser.add_argument("--prefixes", type=str, default="",
                        help=f"Add additional information to pad:\n"
                             f"Not approved: {NOT_APPROVED_SIGN}\n"
                             f"Fachschaftenvollversammlung: {FACHSCHAFTENVERSAMMLUNG_SIGN}\n"
                             f"Attachement: {ATTACHEMENT_SIGN}\n")
    parser.add_argument("--finf", action="store_true")
    parser.add_argument("--fsr", action="store_true")
    parser.add_argument("--download", action="store_true")
    parser.add_argument("--ignoreOld", action="store_true")
    parser.add_argument("--compile", action="store_true")
    parser.add_argument("--attachementsOnly", action="store_true")
    parser.add_argument("--newOnly", action="store_true")
    parser.add_argument("--override", action="store_true")
    # NOTE(review): --mail is accepted but not evaluated anywhere in the visible code.
    parser.add_argument("--mail", action="store_true")
    parsed = parser.parse_args()

    urls = parsed.urls
    if not urls and parsed.pad is None:
        parser.print_usage()
        # parser.exit() instead of the site-provided exit(), which is not always available.
        parser.exit()
    if parsed.finf and parsed.fsr:
        # Report the conflict as a usage error instead of an AttributeError traceback.
        parser.error("Cannot have --finf AND --fsr")

    if parsed.finf:
        DEFAULT_TYPE = "finf"
    if parsed.fsr:
        DEFAULT_TYPE = "fsr"
    if parsed.override:
        OVERRIDE_ACCEPTED = True

    pads = getPadsFromCollectionPad(parsed.pad)
    pads += [Pad.fromURL(url, prefixes=parsed.prefixes) for url in urls]

    # Optional filters; each one narrows the list further.
    if parsed.newOnly:
        pads = [pad for pad in pads if not pad.approved]
    if parsed.ignoreOld:
        pads = [pad for pad in pads if not pad.isOldPad()]
    if parsed.attachementsOnly:
        pads = [pad for pad in pads if pad.isAttachement()]

    if parsed.download:
        downloadPads(pads)
    if parsed.topics:
        allPads(pads, topsOnly=not parsed.allContent)
    if parsed.compile:
        compilePads(pads)
\documentclass[halfparskip, 10pt, a4paper, DIV17, smallheadings]{scrartcl}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[automark]{scrpage2}
\usepackage{lmodern}
\usepackage{xspace}
\usepackage{ngerman}
\usepackage[dvipsnames]{xcolor}
\usepackage{eurosym}
\usepackage{graphicx}
\usepackage{titlesec}
\usepackage{longtable}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{textcomp}
\makeatletter
% Default width for \includegraphics: the natural width of the image,
% limited to \linewidth.
\def\ScaleWidthIfNeeded{%
  \ifdim\Gin@nat@width>\linewidth
    \linewidth
  \else
    \Gin@nat@width
  \fi
}
% Default height for \includegraphics: the natural height of the image,
% limited to 0.9\textheight. The fallback has to be the natural *height*;
% using the natural width here requested a wrong box for small images.
\def\ScaleHeightIfNeeded{%
  \ifdim\Gin@nat@height>0.9\textheight
    0.9\textheight
  \else
    \Gin@nat@height
  \fi
}
\makeatother
% Use the scaling macros defined above as default size for every \includegraphics.
\setkeys{Gin}{width=\ScaleWidthIfNeeded,height=\ScaleHeightIfNeeded,keepaspectratio}%
\setcounter{secnumdepth}{0}% % Turns off numbering for sections
% Heading layout (titlesec): bold \Huge sections with an empty label,
% bold \large subsections.
\titleformat{\section}{\bfseries\Huge}{}{0em}{}
\titleformat{\subsection}[block]{\bfseries\large}{\arabic{subsection}}{0em}{}
%%\titleformat{\subsection}[block]{\bfseries\Large}{}{0em}{}
%\titleformat{\subsubsection}{\bfseries\large}{\arabic{subsection}.\arabic{subsubsection}: }{0em}{}
%\titleformat{\paragraph}{\bfseries}{}{0em}{}
%\DeclareUnicodeCharacter{20AC}{\euro}
%\DeclareUnicodeCharacter{B0}{\textdegree}
% Define \tightlist as a no-op (pandoc emits it in its list output).
\def\tightlist{}
% PDF metadata and link colours. $anlage$ is filled in by pandoc.
% NOTE(review): the visible script only passes "datum" and "version" via -M;
% confirm where "anlage" is supposed to come from.
\hypersetup{
pdfauthor={FR-Informatik},
pdftitle={$anlage$},
pdfsubject={Anlage FR-Informatik - $anlage$},
pdfkeywords={Informatik, Fachrat, Fachschaft, Hannover, Finf, Anlage, $anlage$},
bookmarks=false,
colorlinks=true,
urlcolor=Blue,
linkcolor=Black,
citecolor=Red,
hyperindex=true
}
% Keep the original \href as \oldhref; the redefined \href additionally prints
% the link target as a footnote.
\let\oldhref\href
\renewcommand{\href}[2]{\oldhref{#1}{#2}\footnote{\url{#1}}}
% No paragraph indentation.
\parindent0mm
\begin{document}
% Reset head and foot, then switch to the scrheadings page style.
\clearscrheadfoot
\pagestyle{scrheadings}
% Letterhead: Finf logo, then the LUH logo pushed to the right by \hfill.
\vspace*{-1cm}
\includegraphics[width=0.15\linewidth]{Finf_Logo.pdf}
\hfill
\includegraphics[width=0.5\linewidth]{LUH_logo.pdf}
\hspace*{-13pt}
\smallskip
\vspace*{20pt}
% Address block (left minipage) and contact block (right minipage).
\begin{footnotesize}
\begin{minipage}{.49\linewidth}
Fachrat Informatik,\, Leibniz Universität Hannover \\
Appelstraße 9a \( \bullet \) 30167 Hannover
\end{minipage}
\begin{minipage}{.50\linewidth}
\begin{flushright}
% \oldhref is used here so that these links do not get a URL footnote.
% NOTE(review): the mailto target is fr@finf.uni-hannover.de, the printed text
% is fr@finf-hannover.de -- confirm which address is intended.
E-Mail: \oldhref{mailto:fr@finf.uni-hannover.de}{\color{black}{fr@finf-hannover.de}} \\
\oldhref{https://www.finf.uni-hannover.de/}{\color{black}{www.finf.uni-hannover.de}}
\end{flushright}
\end{minipage}
\end{footnotesize}
% Footer with separation line: organisation, page number, meeting date ($datum$).
\setfootsepline{.4pt}
\cfoot{Fachrat Informatik, Leibniz Universität
Hannover \hfill \pagemark \hfill Fachratssitzung Informatik vom $datum$}
\bigskip
% Document identification line built from the pandoc variables anlage, version and datum.
\textit{Dokument: $anlage$ $version$ \\Gültig ab $datum$}
% The converted pad content is inserted here by pandoc.
$body$
\end{document}
\documentclass[halfparskip, 10pt, a4paper, DIV17, smallheadings]{scrartcl}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage[automark]{scrpage2}
\usepackage{lmodern}
\usepackage{xspace}
\usepackage{ngerman}
\usepackage[dvipsnames]{xcolor}
\usepackage{eurosym}
\usepackage{graphicx}
\usepackage{titlesec}
\usepackage{longtable}
\usepackage{booktabs}
\usepackage{hyperref}
\usepackage{ulem}
\usepackage{textcomp}
%\setcounter{secnumdepth}{0}% % Turns off numbering for sections
% Heading layout (titlesec): sections without label, subsections labelled
% "TOP <n>: ", subsubsections labelled "<n>.<m>: ", paragraphs in bold.
\titleformat{\section}{\bfseries\Large}{}{0em}{}
\titleformat{\subsection}[block]{\bfseries\Large}{TOP \arabic{subsection}: }{0em}{}
%\titleformat{\subsection}[block]{\bfseries\Large}{}{0em}{}
\titleformat{\subsubsection}{\bfseries\large}{\arabic{subsection}.\arabic{subsubsection}: }{0em}{}
\titleformat{\paragraph}{\bfseries}{}{0em}{}
%\DeclareUnicodeCharacter{20AC}{\euro}
%\DeclareUnicodeCharacter{B0}{\textdegree}
% Define \tightlist as a no-op (pandoc emits it in its list output).
\def\tightlist{}
% PDF metadata, bookmarks and link colours of the merged topics document.
\hypersetup{
pdfauthor={FR Informatik},
pdftitle={Protokollthemen der Fachratssitzung Informatik},
pdfsubject={Fachratssitzung Informatik - Leibniz Universität Hannover},
pdfkeywords={Informatik, Fachrat, Fachschaft, Hannover, Finf},
bookmarks=true,
bookmarksopenlevel=subsection,
colorlinks=true,
urlcolor=Blue,
linkcolor=Black,
citecolor=Red,
hyperindex=true
}
% No paragraph indentation.
\parindent0mm
\begin{document}
% Table of contents followed by the merged pad content inserted by pandoc.
\tableofcontents
$body$
\end{document}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment