2003 logbook export/re-import now in HTML format

This commit is contained in:
Philip Sargent 2022-12-09 23:45:07 +00:00
parent 17b2b7b89c
commit cabcada0b8
6 changed files with 98 additions and 70 deletions

View File

@ -148,16 +148,14 @@ def controlpanel(request):
def exportlogbook(request,year=None,extension=None): def exportlogbook(request,year=None,extension=None):
'''Constructs, from the database, a complete HTML formatted logbook - but TEXT ONLY '''Constructs, from the database, a complete HTML formatted logbook
for the current year. Formats available are HTML2005 or HTML2022 (planned) for the current year. Formats available are HTML2005 (others old & broken or not written yet)
There are no images stored in the database, so this is only a tool for a first pass, to be followed by There are no images stored in the database, so this is only a tool for a first pass, to be followed by
extensive hand-editing. hand-editing. However links to images work in the HTML text of a logbook entry
NEED TO ADD IN THE MATERIAL WHICH IS NOT IN ANY LBE ! e.g. front matter. NEED TO ADD IN THE MATERIAL WHICH IS NOT IN ANY LBE ! e.g. front matter.
This function DOES NOT WORK.
This function is the recipient of the POST action of the export form in the control panel This function is the recipient of the POST action of the export form in the control panel
''' '''
def lbeKey(lbe): def lbeKey(lbe):
@ -170,26 +168,18 @@ def exportlogbook(request,year=None,extension=None):
else: else:
print(f'Logbook export {request.POST}') print(f'Logbook export {request.POST}')
if request.POST.get("year", '2016'): year = request.POST['year']
year = request.POST['year']
if request.POST.get("extension", 'html'):
extension = request.POST['extension'] # e.g. html
current_expedition=Expedition.objects.get(year=year) current_expedition=Expedition.objects.get(year=year)
logbook_entries=LogbookEntry.objects.filter(expedition=current_expedition).order_by('date') # need to be sorted by date! logbook_entries=LogbookEntry.objects.filter(expedition=current_expedition).order_by('date') # need to be sorted by date!
#print(f'Logbook has {len(logbook_entries)} entries in it.') print(f'Logbook has {len(logbook_entries)} entries in it.')
if extension == 'html2005': extension ='html'
response = HttpResponse(content_type='text/html') response = HttpResponse(content_type='text/html')
style='2005' style='2005'
else :
extension == 'html2022'
response = HttpResponse(content_type='text/html')
style='2022'
filename='newlogbook.' + extension filename='logbook-new-format.' + extension
template='logbook'+style+'style.'+extension template='logbook'+style+'style.'+ extension
response['Content-Disposition'] = 'attachment; filename='+filename response['Content-Disposition'] = 'attachment; filename='+filename
t=loader.get_template(template) t=loader.get_template(template)
logbookfile = (t.render({'logbook_entries':logbook_entries})) logbookfile = (t.render({'logbook_entries':logbook_entries}))

View File

@ -47,7 +47,7 @@ from django.db import transaction
from troggle.core.utils import get_process_memory from troggle.core.utils import get_process_memory
from troggle.core.models.caves import Cave, Entrance from troggle.core.models.caves import Cave, Entrance
from troggle.parsers.imports import import_caves, import_people, import_surveyscans, \ from troggle.parsers.imports import import_caves, import_people, import_surveyscans, \
import_logbooks, import_QMs, import_survex, import_loadpos, import_drawingsfiles import_logbooks, import_logbook, import_QMs, import_survex, import_loadpos, import_drawingsfiles
if os.geteuid() == 0: if os.geteuid() == 0:
# This protects the server from having the wrong file permissions written on logs and caches # This protects the server from having the wrong file permissions written on logs and caches
@ -343,7 +343,8 @@ def usage():
drawings - read in the Tunnel & Therion files - which scans the survey scans too drawings - read in the Tunnel & Therion files - which scans the survey scans too
survex - read in the survex files - all the survex blocks and entrances x/y/z survex - read in the survex files - all the survex blocks and entrances x/y/z
dumplogbooks - Not used. write out autologbooks (not working?) dumplogbooks - Not used. write out autologbooks (not working? use http://localhost:8000/controlpanel )
logbook - read a single logbook. Default set in python code
and [runlabel] is an optional string identifying this run of the script and [runlabel] is an optional string identifying this run of the script
in the stored profiling data 'import-profile.json' in the stored profiling data 'import-profile.json'
@ -394,6 +395,8 @@ if __name__ == "__main__":
jq.enq("caves",import_caves) jq.enq("caves",import_caves)
elif "logbooks" in sys.argv: elif "logbooks" in sys.argv:
jq.enq("logbooks",import_logbooks) jq.enq("logbooks",import_logbooks)
elif "logbook" in sys.argv:
jq.enq("logbooks",import_logbook) # default year set in imports.py
elif "people" in sys.argv: elif "people" in sys.argv:
jq.enq("people",import_people) jq.enq("people",import_people)
elif "QMs" in sys.argv: elif "QMs" in sys.argv:

View File

@ -41,6 +41,11 @@ def import_logbooks():
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks() troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2003):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
def import_QMs(): def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files") print("-- Importing old QMs for 161, 204, 234 from CSV files")
with transaction.atomic(): with transaction.atomic():

View File

@ -26,6 +26,8 @@ Parses and imports logbooks in all their wonderful confusion
todo=''' todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition() - refactor everything with some urgency, esp. LoadLogbookForExpedition()
- remove the TROG and lbo things since we need the database for multiuser access? Or not?
- profile the code to find bad repetitive things, of which there are many. - profile the code to find bad repetitive things, of which there are many.
- far too many uses of Django field dereferencing to get values, which is SLOW - far too many uses of Django field dereferencing to get values, which is SLOW
@ -55,15 +57,15 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
# but several don't work, and are skipped by the parsing code, e.g. 1983 # but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = { LOGBOOK_PARSER_SETTINGS = {
"2010": ("logbook.html", "parser_html"), "2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "parser_wiki"), "2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "parser_wiki"), "2008": ("2008logbook.txt", "wiki_parser"),
"2007": ("logbook.html", "parser_html"), "2007": ("logbook.html", "parser_html"),
"2006": ("logbook.html", "parser_html"), "2006": ("logbook.html", "parser_html"),
# "2006": ("logbook/logbook_06.txt", "parser_wiki"), # "2006": ("logbook/logbook_06.txt", "wiki_parser"),
"2006": ("logbook.html", "parser_html"), "2006": ("logbook.html", "parser_html"),
"2005": ("logbook.html", "parser_html"), "2005": ("logbook.html", "parser_html"),
"2004": ("logbook.html", "parser_html"), "2004": ("logbook.html", "parser_html"),
"2003": ("logbook.html", "parser_html_03"), "2003": ("logbook.html", "parser_html"),
"2002": ("logbook.html", "parser_html"), "2002": ("logbook.html", "parser_html"),
"2001": ("log.htm", "parser_html_01"), "2001": ("log.htm", "parser_html_01"),
"2000": ("log.htm", "parser_html_01"), "2000": ("log.htm", "parser_html_01"),
@ -88,7 +90,7 @@ LOGBOOK_PARSER_SETTINGS = {
entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79, entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24,"1984": 32,"1983": 52,"1982": 42,} "1985": 24,"1984": 32,"1983": 52,"1982": 42,}
@ -114,7 +116,8 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [ ] res = [ ]
author = None author = None
#print(f'# {tid}') # print(f'# {tid}')
# print(f" - {tid} '{trippeople}' ")
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip() tripperson = tripperson.strip()
@ -147,6 +150,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return None, None return None, None
author = res[-1][0] author = res[-1][0]
#print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None): def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
@ -195,9 +199,10 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
#NEW slug for a logbook entry here! Unique id + slugified title fragment #NEW slug for a logbook entry here! Unique id + slugified title fragment
if tid is not None: if tid is not None:
slug = tid + "_" + slugify(title)[:10].replace('-','_') slug = tid
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
else: else:
slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_') slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug} nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
# This creates the lbo instance of LogbookEntry # This creates the lbo instance of LogbookEntry
@ -205,6 +210,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
for tripperson, time_underground in trippersons: for tripperson, time_underground in trippersons:
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo} lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)} nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
# this creates the PersonTrip instance. # this creates the PersonTrip instance.
@ -251,7 +257,7 @@ def ParseDate(tripdate, year):
return datetime.date(1970, 1, 1) return datetime.date(1970, 1, 1)
# (2006 - not any more), 2008 - 2009 # (2006 - not any more), 2008 - 2009
def parser_wiki(year, expedition, txt): def wiki_parser(year, expedition, txt):
global logentries global logentries
global logdataissues global logdataissues
@ -316,6 +322,11 @@ def parser_html(year, expedition, txt):
if s: if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else: # allow title and people to be swapped in order else: # allow title and people to be swapped in order
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..."
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg
s2 = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date s2 = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)? \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)? \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
@ -329,7 +340,7 @@ def parser_html(year, expedition, txt):
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups() tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
else: else:
if not re.search(r"Rigging Guide", trippara): if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse {tripid1}: {trippara} entry:{logbook_entry_count} " msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..."
print(msg) print(msg)
DataIssue.objects.create(parser='logbooks', message=msg) DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg logdataissues[tid]=msg
@ -343,7 +354,7 @@ def parser_html(year, expedition, txt):
tripcave = "UNKNOWN" tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip() ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext, entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, tripid1) trippeople, expedition, tu, tripid1)
@ -430,10 +441,10 @@ def parser_html_01(year, expedition, txt):
ltriptext = ltriptext[:mtail.start(0)] ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext) ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"</?u>", "_", ltriptext) ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext) ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext) ltriptext = re.sub(r"</?b>", "'''", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
if ltriptext == "": if ltriptext == "":
message = " ! - Zero content for logbook entry!: " + tid message = " ! - Zero content for logbook entry!: " + tid
@ -469,7 +480,7 @@ def parser_html_03(year, expedition, txt):
logbook_entry_count = 0 logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
logbook_entry_count += 1 logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count) tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara) s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) : if not ( s ) :
@ -485,23 +496,30 @@ def parser_html_03(year, expedition, txt):
sheader = tripheader.split(" -- ") sheader = tripheader.split(" -- ")
tu = "" tu = ""
if re.match("T/U|Time underwater", sheader[-1]): if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop() tu = sheader.pop() # not a number in 2003 usually
# print(f" - {logbook_entry_count} '{tu}' ")
if len(sheader) != 3: if len(sheader) != 3:
print(" ! Header not three pieces", sheader) print(" ! Header not three pieces for parser_html_03() ", sheader)
tripdate, triptitle, trippeople = sheader tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ") # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
if len(triptitles) >= 2: # print(f" - {logbook_entry_count} '{trippeople}' ")
tripcave = triptitles[0] titlelist = triptitle.split(" , ")
if len(titlelist) >= 2:
location, *namelist = titlelist # list unpacking operator
tripname = ", ".join(namelist) # concatenate strings
# print(f" - {logbook_entry_count} {location} '{tripname}'")
else: else:
tripcave = "UNKNOWN" location = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = triptext + "<br /><br />\n\n" + tu
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
#ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
entrytuple = (ldate, tripcave, triptitle, ltriptext, entrytuple = (ldate, location, tripname, ltriptext,
trippeople, expedition, tu, tid) trippeople, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
@ -509,8 +527,8 @@ def parser_html_03(year, expedition, txt):
def LoadLogbookForExpedition(expedition): def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """ Parses all logbook entries for one expedition
""" """
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries global logentries
# absolutely horrid. REFACTOR THIS (all my fault..)
global logdataissues global logdataissues
global entries global entries
@ -557,6 +575,10 @@ def LoadLogbookForExpedition(expedition):
expedition.save() expedition.save()
lbes = LogbookEntry.objects.filter(expedition=expedition)
for lbe in lbes:
lbe.delete()
try: try:
file_in = open(logbookpath,'rb') file_in = open(logbookpath,'rb')
txt = file_in.read().decode("utf-8") txt = file_in.read().decode("utf-8")
@ -594,6 +616,14 @@ def LoadLogbookForExpedition(expedition):
return len(logentries) return len(logentries)
def LoadLogbook(year):
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks(): def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """ This is the master function for parsing all logbooks into the Troggle database.
This should be rewritten to use coroutines to load all logbooks from disc in parallel, This should be rewritten to use coroutines to load all logbooks from disc in parallel,

View File

@ -101,7 +101,7 @@
<h3>Export to a different format:</h3> <h3>Export to a different format:</h3>
<p>This creates 'newlogbook.html' in the years/&lt;year&gt;/ folder <p>This creates 'logbook-new-format.html' in the years/&lt;year&gt;/ folder
<table> <table>
<tr> <tr>
@ -128,8 +128,7 @@
Output style: Output style:
<select name="extension"> <select name="extension">
<option value="html2005">.html file - 2005 style</option> <option value="html2005">.html file - 2005 style</option>
<option value="html2022">.html file - 2022 style</option> </select>
</select>
</p> </p>
<p> <p>
<input name="download_logbook" type="submit" value="Download logbook" /> <input name="download_logbook" type="submit" value="Download logbook" />

View File

@ -1,26 +1,27 @@
<!DOCTYPE html>
<html> <html>
<head><title>{{logbook_entries.0.expedition}} Expo Logbook</title></head> <head>
<link rel="stylesheet" type="text/css" href="../../css/main2.css" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style type="text/css"> <title>{{logbook_entries.0.expedition}} Expo Logbook</title>
.tripdate { float: left;} <link rel="stylesheet" href="../../css/main2.css" />
.trippeople { float: right;} </head>
.triptitle { font-size: 120%; text-align: center; font-weight: bold; clear: both } <!-- Exported by troggle in this format after having been imported using a different format and a different parser.
.timeug { text-align: right; font-weight: bold } This is because we are steadily converting old formats to a new common format so that we do not need to maintain half
p { clear: both } a dozen parser functions.
</style> Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook() in troggle/code/views/other.py
-->
<body> <body>
<h1>Expo {{logbook_entries.0.expedition}}</h1> <h1>Expo {{logbook_entries.0.expedition}}</h1>
{%for logbook_entry in logbook_entries%} {%for logbook_entry in logbook_entries%}
<hr /> <hr />
<div class="tripdate" id="t{{logbook_entry.date}}A">{{logbook_entry.date}}</div> <div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div>
<div class="trippeople"><u>{{logbook_entry.author.person}}</u> <div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person}}</u>{% else %}{{ persontrip.personexpedition.person }}{% endif %}, {% endfor %}</div>
{% for persontrip in logbook_entry.persontrip_set.all %}{{ persontrip.personexpedition.person }} {{ persontrip.personexpedition.time_underground }}, {% endfor %}
</div>
<div class="triptitle">{{logbook_entry.place}} - {{logbook_entry.title}}</div> <div class="triptitle">{{logbook_entry.place}} - {{logbook_entry.title}}</div>
{{logbook_entry.text|safe}} {{logbook_entry.text|safe}}
<div class="timeug">T/U: {{logbook_entry.time_underground}}</div>
{% endfor %} {% endfor %}
<hr />
</body>
</html>