2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-04-27 21:21:45 +01:00

Object storage as alternative to SQL

This commit is contained in:
Philip Sargent 2020-06-08 21:33:32 +01:00
parent 8c965015f3
commit d807e3de7d

@ -5,6 +5,7 @@ import os
import pickle import pickle
import re import re
import time import time
import shelve
from django.conf import settings from django.conf import settings
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
@ -96,21 +97,18 @@ def GetCaveLookup():
return Gcavelookup return Gcavelookup
logentries = [] # the entire logbook is a single object: a list of entries logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "Journey", "Loser Plateau" ] noncaveplaces = [ "Journey", "Loser Plateau" ]
# Problems noticed while loading logbooks: issue key (usually a trip id) -> message text.
# Filled in by EnterLogIntoObjStore and the parsers; printed at the end of LoadLogbooks.
logdataissues = {}
# In-memory object store of logbook trips: trip id -> tuple
# (year, date, tripcave, triptitle, text, trippeople, tu, formattype),
# as written by EnterLogIntoObjStore.
trips ={}
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"): def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips """ """ saves a logbook entry and related persontrips """
global logentries
entrytuple = (date, place, title, text,
trippeople, expedition, logtime_underground, entry_type)
logentries.append(entrytuple)
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author: if not author:
print(" * Skipping logentry: " + title + " - no author for entry") print(" ! - Skipping logentry: " + title + " - no author for entry")
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
DataIssue.objects.create(parser='logbooks', message=message) DataIssue.objects.create(parser='logbooks', message=message)
return return
@ -149,8 +147,13 @@ def ParseDate(tripdate, year):
# 2006, 2008 - 2009 # 2006, 2008 - 2009
def Parselogwikitxt(year, expedition, txt): def Parselogwikitxt(year, expedition, txt):
global logentries
global logdataissues
logbook_entry_count = 0
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara: for triphead, triptext in trippara:
logbook_entry_count += 1
tripheadp = triphead.split("|") tripheadp = triphead.split("|")
assert len(tripheadp) == 3, (tripheadp, triptext) assert len(tripheadp) == 3, (tripheadp, triptext)
tripdate, tripplace, trippeople = tripheadp tripdate, tripplace, trippeople = tripheadp
@ -164,15 +167,57 @@ def Parselogwikitxt(year, expedition, txt):
tu = "" tu = ""
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) tripid =""
entrytuple = (ldate, tripcave, tripsplace, triptext,
trippeople, expedition, tu, "wiki", tripid)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
expedition=expedition, logtime_underground=0)
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
tu, "wiki", tripid, logbook_entry_count)
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
    """ Saves one logbook entry in the in-memory object store `trips`.

    The entry is stored as the tuple
    (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
    under a trip id. If `tripid1` is None or empty, an id of the form
    "n<date>-s<seq>" is generated from `date` and `seq`.

    If the id is already present in `trips` the clash is printed, recorded as
    a DataIssue and in `logdataissues`, and the entry is stored under a
    de-duplicated id "d<date>-s<seq>" instead. Should that id be taken too, a
    numeric suffix ("-2", "-3", ...) is appended until the id is unused, so an
    existing trip is never overwritten.

    This will need additional functions to replicate the persontrip
    calculation and storage. For the moment all that is left to be done in
    the django db.
    """
    # `trips` should be a singleton class object in models.py eventually
    global trips
    global logdataissues

    if tripid1 is None or tripid1 == "":
        tid = "n{}-s{:02d}".format(str(date), seq)
    else:
        tid = tripid1

    if tid in trips:
        msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, trips[tid][0], trips[tid][1])
        print(msg)
        DataIssue.objects.create(parser='logbooks', message=msg)

        # The first fallback id is unchanged from before; the suffix is only
        # added when that fallback is itself already in use.
        dedup_base = "d{}-s{:02d}".format(str(date), seq)
        tid = dedup_base
        suffix = 1
        while tid in trips:
            suffix += 1
            tid = "{}-{}".format(dedup_base, suffix)
        logdataissues[tid] = msg

    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)

    # TODO: copy the checking done in EnterLogIntoDbase() here, in particular
    # the "no author for entry" check. GetTripPersons is a db query, so it
    # needs to be available from the object store before that can work, or a
    # different way of doing it has to be designed.
# 2002, 2004, 2005, 2007, 2010 - now # 2002, 2004, 2005, 2007, 2010 - now
def Parseloghtmltxt(year, expedition, txt): def Parseloghtmltxt(year, expedition, txt):
#print(" - Starting log html parser") global logentries
global logdataissues
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0 logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
#print(" - HR detected - maybe a trip?")
logbook_entry_count += 1 logbook_entry_count += 1
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
@ -186,7 +231,10 @@ def Parseloghtmltxt(year, expedition, txt):
''', trippara) ''', trippara)
if not s: if not s:
if not re.search(r"Rigging Guide", trippara): if not re.search(r"Rigging Guide", trippara):
print(("can't parse: ", trippara)) # this is 2007 which needs editing msg = " !- can't parse: {}".format(trippara) # this is 2007 which needs editing
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg
continue continue
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
@ -198,14 +246,28 @@ def Parseloghtmltxt(year, expedition, txt):
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip() ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html", tripid1)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0, trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html") entry_type="html")
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html", tripid1, logbook_entry_count)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt): def Parseloghtml01(year, expedition, txt):
global logentries
global logdataissues
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
logbook_entry_count += 1
s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
assert s, trippara[:300] assert s, trippara[:300]
tripheader, triptext = s.group(1), s.group(2) tripheader, triptext = s.group(1), s.group(2)
@ -238,14 +300,27 @@ def Parseloghtml01(year, expedition, txt):
ltriptext = re.sub(r"</?i>", "''", ltriptext) ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext) ltriptext = re.sub(r"</?b>", "'''", ltriptext)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
logentries.append(entrytuple)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0, trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html") entry_type="html")
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tripid, logbook_entry_count)
# parser for 2003 # parser for 2003
def Parseloghtml03(year, expedition, txt): def Parseloghtml03(year, expedition, txt):
global logentries
global logdataissues
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
logbook_entry_count += 1
s = re.match("(?s)\s*<p>(.*?)</p>(.*)$", trippara) s = re.match("(?s)\s*<p>(.*?)</p>(.*)$", trippara)
assert s, trippara assert s, trippara
tripheader, triptext = s.group(1), s.group(2) tripheader, triptext = s.group(1), s.group(2)
@ -256,7 +331,7 @@ def Parseloghtml03(year, expedition, txt):
if re.match("T/U|Time underwater", sheader[-1]): if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop() tu = sheader.pop()
if len(sheader) != 3: if len(sheader) != 3:
print(("header not three pieces", sheader)) print((" ! Header not three pieces", sheader))
tripdate, triptitle, trippeople = sheader tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ") triptitles = triptitle.split(" , ")
@ -268,10 +343,20 @@ def Parseloghtml03(year, expedition, txt):
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext) ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
tid= "n{}-s{:02d}".format(str(ldate),logbook_entry_count)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html03", tid)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
text = ltriptext, trippeople=trippeople, expedition=expedition, text = ltriptext, trippeople=trippeople, expedition=expedition,
logtime_underground=0, entry_type="html") logtime_underground=0, entry_type="html")
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html03", tid, logbook_entry_count)
def SetDatesFromLogbookEntries(expedition): def SetDatesFromLogbookEntries(expedition):
""" """
@ -292,14 +377,29 @@ def SetDatesFromLogbookEntries(expedition):
persontrip.save() persontrip.save()
def LoadLogbookForExpedition(expedition,numentries): def LoadLogbookForExpedition(expedition,expect):
""" Parses all logbook entries for one expedition """ Parses all logbook entries for one expedition
If a cache is found it uses it. If not found, or fails sanity checks, parses source file.
""" """
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries global logentries
logbook_parseable = False logbook_parseable = False
logbook_cached = False logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS yearlinks = settings.LOGBOOK_PARSER_SETTINGS
expologbase = os.path.join(settings.EXPOWEB, "years") expologbase = os.path.join(settings.EXPOWEB, "years")
logentries=[]
def validcache(year,n):
if year != expedition:
print(" ! year != expedition ",year, expedition )
return False
if len(logentries) != n:
print(" ! len(logentries) != n ",len(logentries), n )
return False
if n != expect:
print(" ! n != expect ",n, expect )
return False
return True
if expedition.year in yearlinks: if expedition.year in yearlinks:
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0]) logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
@ -326,11 +426,15 @@ def LoadLogbookForExpedition(expedition,numentries):
print(" - Reading cache: " + cache_filename, end='') print(" - Reading cache: " + cache_filename, end='')
try: try:
with open(cache_filename, "rb") as f: with open(cache_filename, "rb") as f:
logentries = pickle.load(f) year,n,logentries = pickle.load(f)
print(" -- Loaded ", len(logentries), " log entries") if validcache(year,n):
logbook_cached = True print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True
else:
print(" !- Should be ", expect, " but ", len(logentries), " found in cache")
raise
except: except:
print("\n ! Failed to load corrupt cache. Deleting it.\n") print(" ! Failed to load corrupt cache. Deleting it.")
os.remove(cache_filename) os.remove(cache_filename)
logentries=[] logentries=[]
raise raise
@ -350,50 +454,75 @@ def LoadLogbookForExpedition(expedition,numentries):
parser = globals()[parsefunc] parser = globals()[parsefunc]
parser(expedition.year, expedition, txt) parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition) SetDatesFromLogbookEntries(expedition)
# and this has also stored all the log entries in logentries[]
if len(logentries) >0: if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries") print(" - Cacheing " , len(logentries), " log entries")
with open(cache_filename, "wb") as fc: with open(cache_filename, "wb") as fc:
pickle.dump(logentries, fc, 2) logbk=(expedition,len(logentries),logentries)
pickle.dump(logbk, fc, protocol=4)
else: else:
print(" ! NO TRIP entries found in logbook, check the syntax.") print(" ! NO TRIP entries found in logbook, check the syntax.")
if logbook_cached: if logbook_cached: # working on this bit...
i=0 i=0
for entrytuple in range(len(logentries)): for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = logentries[i]
entry_type = logentries[i] EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\ entry_type)
entry_type) EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground,
entry_type, tripid1, i)
i +=1 i +=1
SetDatesFromLogbookEntries(expedition)
return len(logentries) return len(logentries)
def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.

    Resets the recorded logbook issues and the in-memory trip store, loads the
    logbook of every expedition whose year is not in the no-logbook list,
    writes the found and expected entry counts to "loadlogbk.log", prints a
    per-year summary and finally copies every trip into the shelve file
    'logbktrips.shelve'.
    """
    global logdataissues

    logdataissues = {}
    # The trip store is refilled by every load below. Without this reset a
    # second run in the same process would see each trip id as a duplicate.
    trips.clear()
    DataIssue.objects.filter(parser='logbooks').delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        print(" ! No expeditions found. Load 'people' first.\n")

    # Expedition years for which no logbook is loaded.
    nologbook = {"1976", "1977", "1978", "1979", "1980", "1981", "1983", "1984",
                 "1985", "1986", "1987", "1988", "1989", "1990"}
    # Number of logbook entries expected for each year, passed to
    # LoadLogbookForExpedition and written to the log for comparison.
    entries = {"2020": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
               "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
               "2008": 49, "2007": 111, "2006": 24, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
               "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
               "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1982": 0}

    nlbe = {}  # year -> number of entries reported by LoadLogbookForExpedition
    expd = {}  # year -> number of trips held in the object store
    # Mode "w" starts a fresh log on every run.
    with open("loadlogbk.log", "w") as log:
        for expo in expos:
            if expo.year in nologbook:
                continue
            print((" - Logbook for: " + expo.year))
            if expo.year not in entries:
                # A year missing from the table above used to raise KeyError.
                print(" ! No expected entry count for {}, assuming 0".format(expo.year))
            expected = entries.get(expo.year, 0)
            numentries = LoadLogbookForExpedition(expo, expected)
            log.write("{} {:5d} should be {}\n".format(expo.year, numentries, expected))
            nlbe[expo.year] = numentries
            expd[expo.year] = 0

    print("** total trips in ObjStore:", len(trips))
    for i in logdataissues:
        print("{:15s}: {}".format(i, logdataissues[i]))

    # The first field of every stored trip tuple is its year.
    for trip in trips.values():
        expd[trip[0]] += 1
    yt = 0
    for y in expd:
        print("{} {}".format(y, expd[y]), nlbe[y])
        yt += expd[y]
    print("{} total".format(yt))

    # Each assignment is written through to the shelf, and the with-block
    # closes it, so writeback caching and explicit sync()/close() are not needed.
    with shelve.open('logbktrips.shelve') as odb:
        for lbe in trips:
            odb[lbe] = trips[lbe]
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S) dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S) expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)