"""Turn saved Adlib catalogue detail pages (HTML) into BibTeX entries.

Pipeline, one HTML file at a time:

1. ``parse_adlib_catalog_entry`` reads the page into a dict of catalogue fields,
2. ``save_file`` fetches the linked digital document (if the page has one),
3. ``bibtex`` renders the dict as an ``@techreport`` entry,
4. ``generate_bib`` glues the steps together and writes ``adlib/<Citekey>.bib``.
"""
import glob
import os
import shutil
import subprocess
from html.parser import HTMLParser


class MyAdlibHTMLParser(HTMLParser):
    """Collect label/value pairs found inside ``<ul>`` elements of a page.

    Inside a ``<ul>``, the value of the *last* attribute of each ``<div>``
    selects what the next text node means:

    * ``label``              - text becomes the current field name,
    * ``value``              - text is stored as that field's (string) value,
    * ``separateline-first`` - text starts a list value for that field,
    * ``separateline``       - text is appended to that list.

    An ``<a>`` whose first attribute value is ``ais-pdf`` supplies the
    ``'hyperref'`` entry. Results accumulate in ``self.d``.
    """

    # Marker carried by a <div>  ->  name of the capture flag it arms.
    _DIV_FLAGS = {
        'label': 'getLabel',
        'value': 'getValue',
        'separateline-first': 'getListFirst',
        'separateline': 'getList',
    }

    def __init__(self, *args, **kwargs):
        """Create the parser with its capture state already initialised."""
        super().__init__(*args, **kwargs)
        # Without this, feed() raised AttributeError unless the caller
        # remembered to call update() first.
        self.update()

    def update(self):
        """Reset all capture state and start a fresh, empty result dict."""
        self.foundContent = False
        self.getLabel = False
        self.getValue = False
        self.getListFirst = False
        self.getList = False
        self.currentLabel = ''
        self.currentValue = ''
        self.d = {}

    def handle_starttag(self, tag, attrs):
        """Arm the capture flag (or record the document link) for ``tag``."""
        if tag == 'ul':
            self.foundContent = True
        if not self.foundContent:
            return

        # ``attrs`` is empty for a bare <div>/<a>; indexing it used to raise
        # IndexError, so attribute-less tags are simply ignored.
        if tag == 'div' and attrs:
            armed = self._DIV_FLAGS.get(attrs[-1][-1])
            if armed is not None:
                # Exactly one capture flag is active at a time.
                for flag in self._DIV_FLAGS.values():
                    setattr(self, flag, flag == armed)
        elif tag == 'a' and attrs and attrs[0][1] == 'ais-pdf':
            # Prefer the real href; fall back to the second attribute, which
            # is where the original positional lookup expected it.
            href = dict(attrs).get('href')
            if href is None and len(attrs) > 1:
                href = attrs[1][1]
            if href:
                self.d['hyperref'] = href

    def handle_endtag(self, tag):
        """Stop collecting when a ``</ul>`` is seen."""
        if self.foundContent and tag == 'ul':
            self.foundContent = False

    def handle_data(self, data):
        """Store ``data`` according to whichever capture flag is armed.

        Every flag is one-shot: it is cleared as soon as it has consumed a
        text node, so whitespace between tags is never captured.
        """
        if not self.foundContent:
            return

        if self.getLabel:
            self.currentLabel = data
            self.getLabel = False
        elif self.getValue:
            self.currentValue = data
            self.getValue = False
            self.d[self.currentLabel] = data
        elif self.getListFirst:
            self.currentValue = data
            self.getListFirst = False
            self.d[self.currentLabel] = [data]
        elif self.getList:
            self.currentValue = data
            # The original cleared getValue here, leaving getList armed so
            # that every following text node was appended to the list.
            self.getList = False
            existing = self.d.get(self.currentLabel)
            if isinstance(existing, list):
                existing.append(data)
            elif existing is None:
                # 'separateline' without a preceding 'separateline-first'.
                self.d[self.currentLabel] = [data]
            else:
                self.d[self.currentLabel] = [existing, data]


def _format_author(tokens):
    """Render one author, given as name tokens, in 'Surname, First' form.

    The first token is treated as the given name/initials and the last as the
    surname. Any tokens in between are treated as a surname particle and
    emitted through a ``\\van{Surname}{Particle}{particle}`` macro call.
    A single token is returned unchanged.
    """
    if len(tokens) == 1:
        return tokens[0]
    given, surname = tokens[0], tokens[-1]
    if len(tokens) == 2:
        return f"{surname}, {given}"
    particle = ' '.join(tokens[1:-1])
    macro = "{\\van{" + surname + "}{" + particle.capitalize() + "}{" + particle + "}}"
    return f"{macro} {surname}, {given}"


def parse_adlib_catalog_entry(html_file):
    """Parse one saved catalogue page into a dict ready for ``bibtex``.

    Args:
        html_file: path of a UTF-8 encoded HTML file.

    Returns:
        The parser's field dict, with ``'Author'`` replaced by a braced,
        ``' and '``-joined BibTeX author string and ``'Citekey'`` added
        (first author's name without the leading token, plus characters 2-3
        of ``'Year of publication'``).

    Raises:
        KeyError: the page has no ``'Author'`` or ``'Year of publication'``.
        ValueError: every author value on the page is blank.
    """
    parser = MyAdlibHTMLParser()
    with open(html_file, 'r', encoding="utf8") as f:
        parser.feed(f.read())
    entry = parser.d

    raw_authors = entry['Author']
    if isinstance(raw_authors, str):
        # A lone string would otherwise be iterated character by character.
        raw_authors = [raw_authors]
    # split() without an argument also trims and collapses repeated blanks.
    authors = [tokens for tokens in (name.split() for name in raw_authors) if tokens]
    if not authors:
        raise ValueError(f"{html_file}: no usable author name found")

    entry['Author'] = '{' + ' and '.join(_format_author(tokens) for tokens in authors) + '}'

    first = authors[0]
    # A one-word author would leave nothing after dropping the first token.
    key_tokens = first[1:] if len(first) > 1 else first
    entry['Citekey'] = ''.join(key_tokens) + entry['Year of publication'][2:4]
    return entry


def save_file(entry):
    """Download the entry's digital document and copy it into ``adlib/``.

    The file is fetched with ``curl -O`` into the current directory (skipped
    when a file of that name is already there) and then copied to
    ``adlib/<Citekey><ext>``.

    Args:
        entry: dict with ``'Citekey'`` and, optionally, ``'hyperref'``.

    Returns:
        1 when there is nothing to download, the copy was made, or an
        equally sized copy already exists; 0 when the download failed or an
        existing copy differs in size.
    """
    os.makedirs('adlib', exist_ok=True)
    if 'hyperref' not in entry:
        return 1

    url = entry['hyperref']
    dfilename = os.path.split(url)[-1]
    ext = os.path.splitext(dfilename)[1]
    # The URL comes from the scraped page: refuse anything curl would read as
    # an option, and URLs that carry no file name for -O to use.
    if not dfilename or url.startswith('-'):
        print(f"Cannot download digital document from: {url}")
        return 0

    if not os.path.exists(dfilename):
        try:
            # Argument list instead of a concatenated command string: works
            # without a shell on every platform and keeps the URL a single
            # argument. --fail stops HTTP error pages being saved as the file.
            subprocess.run(['curl', '--fail', '-O', url], check=True)
        except (OSError, subprocess.CalledProcessError) as err:
            print(f"Download of {url} failed: {err}")
            return 0
    if not os.path.exists(dfilename):
        print(f"Download of {url} produced no file {dfilename}")
        return 0

    fullfilename = os.path.join('adlib', entry['Citekey'] + ext)
    if not os.path.exists(fullfilename):
        shutil.copyfile(dfilename, fullfilename)
        print(f"Digital document saved to: {fullfilename}")
        return 1
    if os.path.getsize(dfilename) == os.path.getsize(fullfilename):
        print(f"Digital document {fullfilename} already exists")
        return 1
    print(f"Digital document {fullfilename} exists but differs in size from {dfilename}")
    return 0


# Catalogue fields that bibtex() maps onto dedicated BibTeX fields.
_STANDARD_KEYS = (
    'Citekey', 'Title', 'Author', 'Publisher', 'Place of publication',
    'Year of publication', 'Material', 'Pagination',
)


def _bib_line(name, value):
    """Return one ``name = value,`` line; ``value`` is already delimited."""
    return f"  {name:<12} = {value},\n"


def bibtex(entry):
    """Render ``entry`` as the text of a BibTeX ``@techreport`` record.

    Args:
        entry: dict as returned by ``parse_adlib_catalog_entry``.
            ``'Citekey'`` and ``'Author'`` are required; the other catalogue
            fields are optional and rendered empty when absent.

    Returns:
        The record as a string ending in ``"}\\n"``. Catalogue fields without
        a dedicated BibTeX field are appended in the order they occur in
        ``entry``, named by their lower-cased label with spaces removed.

    Raises:
        KeyError: ``'Citekey'`` or ``'Author'`` is missing.
    """
    published = entry.get('Year of publication', '')
    institution = ', '.join(
        part
        for part in (entry.get('Publisher', ''), entry.get('Place of publication', ''))
        if part
    )
    # Whatever follows the four-digit year, e.g. ' (okt.)' -> 'okt.'.
    month = published[4:].strip().strip('()').strip()

    lines = [
        f"@techreport{{{entry['Citekey']}, \n",
        _bib_line('author', entry['Author']),
        # Double braces stop BibTeX styles from re-casing the text.
        _bib_line('title', '{{' + entry.get('Title', '') + '}}'),
        _bib_line('institution', '{' + institution + '}'),
        _bib_line('year', '{' + published[0:4] + '}'),
        _bib_line('type', '{{' + entry.get('Material', '') + '}}'),
    ]
    if 'Pagination' in entry:
        lines.append(_bib_line('pages', '{' + entry['Pagination'] + '}'))
    lines += [
        _bib_line('address', '{}'),
        _bib_line('month', '{' + month + '}'),
        _bib_line('note', '{}'),
        _bib_line('annote', '{}'),
    ]

    # Walk the dict itself rather than a set difference, so the field order
    # is the same on every run.
    for key, value in entry.items():
        if key in _STANDARD_KEYS:
            continue
        if isinstance(value, (list, tuple)):
            # Avoid writing a Python list repr into the .bib file.
            value = '; '.join(str(item) for item in value)
        lines.append(_bib_line(key.replace(' ', '').lower(), '{' + str(value) + '}'))

    lines.append("}\n")
    return ''.join(lines)


def generate_bib(filename):
    """Parse ``filename``, fetch its document and write ``adlib/<Citekey>.bib``.

    The parsed entry and the BibTeX text are printed. An existing ``.bib``
    file is left untouched.
    """
    entry = parse_adlib_catalog_entry(filename)
    print(entry)
    save_file(entry)
    bibentry = bibtex(entry)
    print(bibentry)

    # Do not rely on save_file() having created the output directory.
    os.makedirs('adlib', exist_ok=True)
    bibfile = os.path.join('adlib', entry['Citekey'] + '.bib')
    if not os.path.exists(bibfile):
        # Same encoding as the input; the platform default may not be able
        # to represent every character of the catalogue text.
        with open(bibfile, 'w', encoding='utf8') as f:
            f.write(bibentry)
        print(f"Bibliography document saved to: {bibfile}")
    else:
        print(f"Bibliography document {bibfile} already exists")


if __name__ == '__main__':
    # Convert every saved catalogue page in the working directory.
    for html_path in sorted(glob.glob('fullCatalogue*.html')):
        generate_bib(html_path)