Importing old logbooks

This commit is contained in:
Philip Sargent 2021-02-06 00:18:48 +00:00
parent a4d7183260
commit 5836c6ff90
6 changed files with 356 additions and 48 deletions

268
core/views_logbooks2.py Normal file
View File

@ -0,0 +1,268 @@
import datetime
import os.path
import re
import django.db.models
from django.db.models import Min, Max
from django.urls import reverse
from django.http import HttpResponse, HttpResponseRedirect
from django.shortcuts import render, render_to_response
from django.template import Context, loader
from django.template.defaultfilters import slugify
from django.utils import timezone
from django.views.generic.list import ListView
import troggle.core.models as models
import troggle.parsers.logbooks as logbookparsers
from troggle.core.forms import getTripForm # , get_name, PersonForm
from troggle.core.models import Expedition, Person, PersonExpedition
from troggle.core.models_caves import LogbookEntry, PersonTrip
from troggle.core.models_survex import SurvexBlock
from troggle.helper import login_required_if_public
from troggle.parsers.logbooks import LoadLogbookForExpedition
from troggle.parsers.people import GetPersonExpeditionNameLookup
import troggle.settings as settings
# Django uses Context, not RequestContext when you call render
# to_response. We always want to use RequestContext, so that
# django adds the context from settings.TEMPLATE_CONTEXT_PROCESSORS.
# This way we automatically get necessary settings variables passed
# to each template. So we use a custom method, render_response
# instead of render_to_response. Hopefully future Django releases
# will make this unnecessary.
# from troggle.alwaysUseRequestContext import render_response
# Deprecated in 1.11.29
# @django.db.models.permalink #this allows the nice get_absolute_url syntax we are using
def getNotablePersons():
    """Return a list of every Person whose ``bisnotable()`` is truthy.

    Every Person row is fetched through ``Person.objects.all()`` and the
    filtering is done in Python, one ``bisnotable()`` call per person.
    """
    return [candidate for candidate in Person.objects.all() if candidate.bisnotable()]
def personindex(request):
    """Render 'personindex.html', the index page of all people.

    The template context contains:
      persons        -- every Person (``Person.objects.all()``)
      personss       -- the same people cut into ``ncols`` consecutive chunks
      notablepersons -- the people whose ``bisnotable()`` is truthy
    """
    persons = Person.objects.all()
    ncols = 4
    # Chunk length is the ceiling of len(persons) / ncols, so ncols chunks cover everyone.
    per_column = (len(persons) + ncols - 1) // ncols
    # Original note (AC, 16 Feb 09): "persons" seems to be the table rows,
    # while "personss" is the table columns.
    personss = [persons[col * per_column:(col + 1) * per_column] for col in range(ncols)]
    notablepersons = [candidate for candidate in Person.objects.all() if candidate.bisnotable()]
    context = {'persons': persons, 'personss': personss, 'notablepersons': notablepersons}
    return render(request, 'personindex.html', context)
def expedition(request, expeditionname):
    """Render 'expedition.html' for the expedition whose year is ``int(expeditionname)``.

    For every PersonExpedition of that expedition a row is built with one cell
    per distinct date found on the expedition's logbook entries and survex
    blocks. Each cell holds the matching PersonTrip queryset ("persontrips")
    and the set of matching SurvexBlocks ("survexblocks").

    If the request carries a "reload" GET parameter, LoadLogbookForExpedition
    is called after the rows have been built and before rendering.
    """
    this_expedition = Expedition.objects.get(year=int(expeditionname))
    expeditions = Expedition.objects.all()

    dateditems = list(this_expedition.logbookentry_set.all()) + list(this_expedition.survexblock_set.all())
    dates = sorted({item.date for item in dateditems})

    personexpeditiondays = []
    for member in this_expedition.personexpedition_set.all():
        # One cell per date; the dict literal keeps the original evaluation order
        # (lazy PersonTrip queryset first, then the SurvexBlock set).
        personrow = [
            {
                "persontrips": PersonTrip.objects.filter(personexpedition=member,
                                                         logbook_entry__date=day),
                "survexblocks": set(SurvexBlock.objects.filter(survexpersonrole__personexpedition=member,
                                                               date=day)),
            }
            for day in dates
        ]
        personexpeditiondays.append({"personexpedition": member, "personrow": personrow})

    # NOTE(review): the reload runs after dateditems/dates were already evaluated,
    # so this response presumably still shows pre-reload data for those -- confirm intent.
    # NOTE(review): verify this call against the current signature of
    # LoadLogbookForExpedition in troggle.parsers.logbooks (it may require a second argument).
    if "reload" in request.GET:
        LoadLogbookForExpedition(this_expedition)

    context = {
        'expedition': this_expedition,
        'expeditions': expeditions,
        'personexpeditiondays': personexpeditiondays,
        'settings': settings,
        'dateditems': dateditems,
    }
    return render(request, 'expedition.html', context)
def get_absolute_url(self):
    """Return the pair ``('expedition', expedition.year)``.

    ``self`` is not used. The second element is a plain parenthesised
    expression, not a one-element tuple.

    NOTE(review): at module level the name ``expedition`` resolves to the view
    function of that name, which has no ``year`` attribute, so calling this as
    written would presumably raise AttributeError. It looks like a leftover
    from a model's permalink-style method -- confirm whether it is still used.
    """
    return ('expedition', (expedition.year))
class ExpeditionListView(ListView): # django thus expects a template called "expedition_list.html"
    """Generic list view over Expedition using ListView's defaults."""
    # The default template name is derived from the name of the model (the object),
    # not from the name of this view class.
    model = Expedition
class Expeditions_tsvListView(ListView):
    """List view of all Expeditions rendered through the TSV list template.

    This uses the Django built-in generic-view shortcut mechanism.
    By default it would use a template named <app-label>/<model-name>_list.html.
    https://www.agiliq.com/blog/2017/12/when-and-how-use-django-listview/
    https://developer.mozilla.org/en-US/docs/Learn/Server-side/Django/Generic_views
    Either a queryset variable or a get_queryset() function can be used, but neither
    is needed if you want all the objects of a particular type, in which case
    just set model = <object>
    """
    template_name = 'core/expeditions_tsv_list.html' # if not present then uses core/expedition_list.html
    #queryset = Expedition.objects.all()
    #context_object_name = 'expedition'
    model = Expedition # equivalent to .objects.all() for a queryset
class Expeditions_jsonListView(ListView):
    """List view of all Expeditions rendered through the JSON list template."""
    # Explicit template; without it ListView would fall back to its model-derived default name.
    template_name = 'core/expeditions_json_list.html'
    model = Expedition
def person(request, first_name='', last_name='', ):
    """Render 'person.html' for the Person with the given first and last name.

    A GET request carrying ``clear_profile=True`` instead detaches the user
    from this person (``user`` set to None and saved) and redirects to the
    'profiles_select_profile' URL. This lets a user undo having linked their
    profile to the wrong person.
    """
    this_person = Person.objects.get(first_name=first_name, last_name=last_name)

    clear_requested = request.method == 'GET' and request.GET.get('clear_profile') == 'True'
    if not clear_requested:
        return render(request, 'person.html', {'person': this_person, })

    # Remove the reference to the user's profile, then send them to pick another.
    this_person.user = None
    this_person.save()
    return HttpResponseRedirect(reverse('profiles_select_profile'))
def GetPersonChronology(personexpedition):
    """Build the date-ordered table of a person's trips and survex blocks.

    Returns a list of 3-tuples ``(label, persontrip, survexblock)``. For each
    date there are as many rows as the longer of that date's two lists. The
    first row of a date carries the date itself as label (or "--" if the date
    is falsy), and later rows carry "--". A slot with no item holds False.

    Known problem, recorded by the original author: when there is more than
    one survex block per day the entry is duplicated and comes out wrong. Only
    the display on this page is affected; nothing incorrect is written to the
    database.
    """
    by_date = {}
    for trip in personexpedition.persontrip_set.all():
        day_bucket = by_date.setdefault(trip.logbook_entry.date, {})
        day_bucket.setdefault("persontrips", []).append(trip)
    for role in personexpedition.survexpersonrole_set.all():
        day_bucket = by_date.setdefault(role.survexblock.date, {})
        # The "personroles" list stores the survex blocks, not the role objects.
        day_bucket.setdefault("personroles", []).append(role.survexblock)

    # Flatten the per-date buckets into table rows, in date order.
    chronology = []
    for day in sorted(by_date):
        trips = by_date[day].get("persontrips", [])
        blocks = by_date[day].get("personroles", [])
        for n in range(max(len(trips), len(blocks))):
            label = (day or "--") if n == 0 else "--"
            trip_cell = trips[n] if n < len(trips) else False
            block_cell = blocks[n] if n < len(blocks) else False
            chronology.append((label, trip_cell, block_cell))
    return chronology
def personexpedition(request, first_name='', last_name='', year=''):
    """Render 'personexpedition.html' for one person's participation in one expedition.

    The Person is looked up by first and last name and the Expedition by year;
    the PersonExpedition joining the two is passed to the template together
    with the chronology produced by GetPersonChronology.
    """
    the_person = Person.objects.get(first_name=first_name, last_name=last_name)
    this_expedition = Expedition.objects.get(year=year)
    membership = the_person.personexpedition_set.get(expedition=this_expedition)
    context = {
        'personexpedition': membership,
        'personchronology': GetPersonChronology(membership),
    }
    return render(request, 'personexpedition.html', context)
def logbookentry(request, date, slug):
    """Render the logbook entry identified by ``date`` and ``slug``.

    Several matches: render 'object_list.html' listing all of them.
    Exactly one match: render 'logbookentry.html' for it.
    No match: raise Http404. Previously the empty queryset was indexed with
    ``[0]``, which raised IndexError and surfaced as a server error.
    """
    # Imported locally so this fix does not depend on the file-level import list.
    from django.http import Http404

    matches = LogbookEntry.objects.filter(date=date, slug=slug)
    match_count = len(matches)
    if match_count > 1:
        return render(request, 'object_list.html', {'object_list': matches})
    if match_count == 0:
        raise Http404("No logbook entry found for date '%s' and slug '%s'" % (date, slug))
    return render(request, 'logbookentry.html', {'logbookentry': matches[0]})
def logbookSearch(request, extra):
    """Render 'logbooksearch.html' with the logbook entries matching GET parameter ``q``.

    When ``q`` is missing or only whitespace, the template receives an empty
    query string and ``found_entries`` set to None. ``extra`` is accepted but
    not used.
    """
    context = {'query_string': '', 'found_entries': None, }
    if ('q' in request.GET) and request.GET['q'].strip():
        context['query_string'] = request.GET['q']
        # NOTE(review): `search` is not among the imports visible at the top of
        # this file -- confirm where get_query is meant to come from.
        entry_query = search.get_query(context['query_string'], ['text', 'title', ])
        context['found_entries'] = LogbookEntry.objects.filter(entry_query)
    return render(request, 'logbooksearch.html', context)
def personForm(request,pk):
    """Render 'personform.html' with a PersonForm bound to the Person with primary key ``pk``."""
    target = Person.objects.get(pk=pk)
    # NOTE(review): PersonForm appears only in the commented-out tail of the
    # troggle.core.forms import in this file -- confirm it is actually in scope.
    edit_form = PersonForm(instance=target)
    return render(request, 'personform.html', {'form': edit_form, })
# tried to delete all this, and the reference in urls.py, but got impenetrable django error message
# @login_required_if_public
# def newLogbookEntry(request, expeditionyear, pdate = None, pslug = None):
# expedition = Expedition.objects.get(year=expeditionyear)
# PersonTripFormSet, TripForm = getTripForm(expedition)
# if pslug and pdate:
# previousdate = datetime.date(*[int(x) for x in pdate.split("-")])
# previouslbe = LogbookEntry.objects.get(slug = pslug, date = previousdate, expedition = expedition)
# assert previouslbe.filename
# if request.method == 'POST': # If the form has been submitted...
# tripForm = TripForm(request.POST) # A form bound to the POST data
# personTripFormSet = PersonTripFormSet(request.POST)
# if tripForm.is_valid() and personTripFormSet.is_valid(): # All validation rules pass
# dateStr = tripForm.cleaned_data["date"].strftime("%Y-%m-%d")
# directory = os.path.join(settings.EXPOWEB,
# "years",
# expedition.year,
# "autologbook")
# filename = os.path.join(directory,
# dateStr + "." + slugify(tripForm.cleaned_data["title"])[:50] + ".html")
# if not os.path.isdir(directory):
# os.mkdir(directory)
# if pslug and pdate:
# delLogbookEntry(previouslbe)
# f = open(filename, "w")
# template = loader.get_template('dataformat/logbookentry.html')
# context = Context({'trip': tripForm.cleaned_data,
# 'persons': personTripFormSet.cleaned_data,
# 'date': dateStr,
# 'expeditionyear': expeditionyear})
# f.write(template.render(context))
# f.close()
# print((logbookparsers.parseAutoLogBookEntry(filename)))
# return HttpResponseRedirect(reverse('expedition', args=[expedition.year])) # Redirect after POST
# else:
# if pslug and pdate:
# if previouslbe.cave:
# tripForm = TripForm(initial={"date": previousdate,
# "title": previouslbe.title,
# "cave": previouslbe.cave.reference(),
# "location": None,
# "caveOrLocation": "cave",
# "html": previouslbe.text})
# else:
# tripForm = TripForm(initial={"date": previousdate,
# "title": previouslbe.title,
# "cave": None,
# "location": previouslbe.place,
# "caveOrLocation": "location",
# "html": previouslbe.text})
# personTripFormSet = PersonTripFormSet(initial=[{"name": get_name(py.personexpedition),
# "TU": py.time_underground,
# "author": py.is_logbook_entry_author}
# for py in previouslbe.persontrip_set.all()])
# else:
# tripForm = TripForm() # An unbound form
# personTripFormSet = PersonTripFormSet()
# return render(request, 'newlogbookentry.html', {
# 'tripForm': tripForm,
# 'personTripFormSet': personTripFormSet,
# })
# @login_required_if_public
# def deleteLogbookEntry(request, expeditionyear, date = None, slug = None):
# expedition = Expedition.objects.get(year=expeditionyear)
# previousdate = datetime.date(*[int(x) for x in date.split("-")])
# previouslbe = LogbookEntry.objects.get(slug = slug, date = previousdate, expedition = expedition)
# delLogbookEntry(previouslbe)
# return HttpResponseRedirect(reverse('expedition', args=[expedition.year])) # Redirect after POST
# def delLogbookEntry(lbe):
# for pt in lbe.persontrip_set.all():
# pt.delete()
# lbe.delete()
# os.remove(lbe.filename)
def get_people(request, expeditionslug):
    """Render 'options.html' with one (slug, name) pair per PersonExpedition.

    The expedition is looked up by treating ``expeditionslug`` as its year.
    """
    exp = Expedition.objects.get(year=expeditionslug)
    items = []
    for member in exp.personexpedition_set.all():
        items.append((member.slug, member.name))
    return render(request, 'options.html', {"items": items})
def get_logbook_entries(request, expeditionslug):
    """Render 'options.html' with one (slug, "date - title") pair per logbook entry.

    The expedition is looked up by treating ``expeditionslug`` as its year.
    """
    exp = Expedition.objects.get(year=expeditionslug)
    items = []
    for entry in exp.logbookentry_set.all():
        label = "%s - %s" % (entry.date, entry.title)
        items.append((entry.slug, label))
    return render(request, 'options.html', {"items": items})

View File

@ -37,9 +37,10 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
print((" - No name match for: '%s'" % tripperson))
message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year)
message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year)
print(message)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[expedition.year + "~" + tripperson]=message
res.append((personyear, logtime_underground))
if mul:
author = personyear
@ -91,6 +92,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
print(" ! - Skipping logentry: " + title + " - no author for entry")
message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
return
# This needs attention. The slug field is derived from 'title'
@ -133,7 +135,7 @@ def ParseDate(tripdate, year):
else:
message = " ! - Bad date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["author"]=message
logdataissues["tripdate"]=message
assert False, tripdate
return datetime.date(year, month, day)
@ -254,57 +256,77 @@ def Parseloghtmltxt(year, expedition, txt):
"html", tripid1, logbook_entry_count)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
def Parseloghtml01(year, expedition, txt):
global logentries
global logdataissues
errorcount = 0
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
try:
tripentry = year + "." + str(logbook_entry_count)
s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tripentry]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
assert s, trippara[:300]
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
if mtu:
tu = mtu.group(1)
triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
else:
tu = ""
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
if mtu:
tu = mtu.group(1)
triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
else:
tu = ""
triptitles = triptitle.split(" - ")
tripcave = triptitles[0].strip()
triptitles = triptitle.split(" - ")
tripcave = triptitles[0].strip()
ltriptext = triptext
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
ltriptext = triptext
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
logentries.append(entrytuple)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
logentries.append(entrytuple)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tripid, logbook_entry_count)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tripid, logbook_entry_count)
except:
message = " ! - Skipping logentry due to exception in: " + tripentry
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tripentry]=message
print(message)
errorcount += 1
if errorcount >5 :
message = " !!- TOO MANY ERRORS - aborting logbook: " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tripentry]=message
print(message)
return
# parser for 2003
def Parseloghtml03(year, expedition, txt):
@ -473,6 +495,8 @@ def LoadLogbookForExpedition(expedition,expect):
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
Parser settings appropriate for each year are set in settings.py LOGBOOK_PARSER_SETTINGS.
This should be rewritten to use coroutines to load all logbooks from disc in parallel.
"""
global logdataissues
@ -481,13 +505,14 @@ def LoadLogbooks():
expos = Expedition.objects.all()
if len(expos) <= 1:
print(" ! No expeditions found. Load 'people' first.\n")
nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
"1985","1986","1987","1988","1989","1990",]
entries = {"2020": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
nologbook = ["1976", "1977", "1978", "1979", "1980", "1981", "1986", "2020",]
entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1982": 0}
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 1,"1984": 1,"1983": 1,"1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
try:
os.remove("loadlogbk.log")
except OSError:
@ -503,8 +528,8 @@ def LoadLogbooks():
nlbe[expo.year]=numentries
expd[expo.year]= 0
print("** total trips in ObjStore:", len(trips))
for i in logdataissues:
print("{:15s}: {}".format(i, logdataissues[i]))
#for i in logdataissues:
# print("{:15s}: {}".format(i, logdataissues[i]))
for lbe in trips:
year, date, tripcave, triptitle, text, trippeople, tu, formattype = trips[lbe]
@ -513,7 +538,7 @@ def LoadLogbooks():
for y in expd:
print("{} {}".format(y, expd[y]), nlbe[y])
yt += expd[y]
print("{} total".format(yt))
print("total {} log entries in all expeditions".format(yt))
with shelve.open('logbktrips.shelve',writeback=True) as odb:
for lbe in trips:

View File

@ -5,6 +5,12 @@ from utils import save_carefully
from html.parser import HTMLParser
from unidecode import unidecode
'''These functions do not match how the stand-alone script works. So the script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
'''
def parseMugShotAndBlurb(personline, header, person):
"""create mugshot Photo instance"""
mugShotFilename=personline[header["Mugshot"]]

View File

@ -1252,6 +1252,7 @@ def LoadPositions():
print(" - Regenerating stale cavern .log and .3d for '{}'\n days old: {:.1f} {:.1f} {:.1f}".
format(topdata, (svx_t - d3d_t)/(24*3600), (cav_t - d3d_t)/(24*3600), (now - d3d_t)/(24*3600)))
call([settings.CAVERN, "--log", "--output={}".format(topdata), "{}.svx".format(topdata)])
print(" - Regenerating {} {}.3d in {}".format(settings.THREEDTOPOS, topdata, settings.SURVEX_DATA))
call([settings.THREEDTOPOS, '{}.3d'.format(topdata)], cwd = settings.SURVEX_DATA)
topdata = settings.SURVEX_DATA + settings.SURVEX_TOPNAME

View File

@ -87,6 +87,14 @@ LOGBOOK_PARSER_SETTINGS = {
"1993": ("1993/log.htm", "Parseloghtml01"),
"1992": ("1992/log.htm", "Parseloghtml01"),
"1991": ("1991/log.htm", "Parseloghtml01"),
"1990": ("1990/log.htm", "Parseloghtml01"),
"1989": ("1989/log.htm", "Parseloghtml01"),
"1988": ("1988/log.htm", "Parseloghtml01"),
"1987": ("1987/log.htm", "Parseloghtml01"),
"1985": ("1985/log.htm", "Parseloghtml01"),
"1984": ("1984/log.htm", "Parseloghtml01"),
"1983": ("1983/log.htm", "Parseloghtml01"),
"1982": ("1982/log.htm", "Parseloghtml01"),
}
APPEND_SLASH = False

View File

@ -16,7 +16,7 @@
<body onLoad="contentHeight();">
<div id="header">
<h1>CUCC Expeditions to Austria: 1976 - 2020</h1>
<h1>CUCC Expeditions to Austria: 1976 - 2021</h1>
<div id="editLinks"> {% block loginInfo %}
<a href="{{settings.EXPOWEB_URL}}">Website home</a> |
{% if user.username %}