mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-21 23:01:52 +00:00
Object storage as alternative to SQL
This commit is contained in:
parent
8c965015f3
commit
d807e3de7d
@ -5,6 +5,7 @@ import os
|
||||
import pickle
|
||||
import re
|
||||
import time
|
||||
import shelve
|
||||
|
||||
from django.conf import settings
|
||||
from django.template.defaultfilters import slugify
|
||||
@ -96,21 +97,18 @@ def GetCaveLookup():
|
||||
return Gcavelookup
|
||||
|
||||
|
||||
logentries = [] # the entire logbook is a single object: a list of entries
|
||||
logentries = [] # the entire logbook for one year is a single object: a list of entries
|
||||
noncaveplaces = [ "Journey", "Loser Plateau" ]
|
||||
logdataissues = {}
|
||||
trips ={}
|
||||
|
||||
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
|
||||
""" saves a logbook entry and related persontrips """
|
||||
global logentries
|
||||
|
||||
entrytuple = (date, place, title, text,
|
||||
trippeople, expedition, logtime_underground, entry_type)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
|
||||
if not author:
|
||||
print(" * Skipping logentry: " + title + " - no author for entry")
|
||||
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
|
||||
print(" ! - Skipping logentry: " + title + " - no author for entry")
|
||||
message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
return
|
||||
|
||||
@ -149,8 +147,13 @@ def ParseDate(tripdate, year):
|
||||
|
||||
# Parser for 2006 and 2008-2009
|
||||
def Parselogwikitxt(year, expedition, txt):
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
logbook_entry_count = 0
|
||||
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
|
||||
for triphead, triptext in trippara:
|
||||
logbook_entry_count += 1
|
||||
tripheadp = triphead.split("|")
|
||||
assert len(tripheadp) == 3, (tripheadp, triptext)
|
||||
tripdate, tripplace, trippeople = tripheadp
|
||||
@ -164,15 +167,57 @@ def Parselogwikitxt(year, expedition, txt):
|
||||
tu = ""
|
||||
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
|
||||
tripid =""
|
||||
|
||||
entrytuple = (ldate, tripcave, tripsplace, triptext,
|
||||
trippeople, expedition, tu, "wiki", tripid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
|
||||
expedition=expedition, logtime_underground=0)
|
||||
|
||||
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
|
||||
tu, "wiki", tripid, logbook_entry_count)
|
||||
|
||||
|
||||
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
    """Store one logbook entry in the module-level ``trips`` dictionary.

    The entry is stored as the tuple
    ``(year, date, tripcave, triptitle, text, trippeople, tu, formattype)``.

    Key selection:
      * ``tripid1`` is used as the key when it is neither ``None`` nor ``""``;
        otherwise the key is ``"n<date>-s<seq>"`` with ``seq`` zero-padded to
        two digits.
      * If that key is already present in ``trips``, a DUPLICATE message is
        printed, saved as a ``DataIssue`` (parser 'logbooks'), the key is
        replaced by ``"d<date>-s<seq>"`` and the message is also recorded in
        ``logdataissues`` under that replacement key.

    The entry is then written to ``trips`` unconditionally, so an existing
    entry under the final key is overwritten.
    """
    # Both dictionaries are only mutated in place, never rebound, so no
    # 'global' statement is required.
    # NOTE: 'trips' should eventually become a singleton class object in models.py.

    def make_key(prefix):
        # Built lazily so that the format call only runs when a generated id is needed.
        return prefix + "{}-s{:02d}".format(str(date), seq)

    has_given_id = not (tripid1 is None or tripid1 == "")
    key = tripid1 if has_given_id else make_key("n")

    if key in trips:
        clash = trips[key]
        msg = " ! DUPLICATE id .{}. {} ~{}~".format(key, clash[0], clash[1])
        print(msg)
        DataIssue.objects.create(parser='logbooks', message=msg)
        key = make_key("d")
        logdataissues[key] = msg

    trips[key] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)

    # TODO: replicate here the checks done in EnterLogIntoDbase(), in particular
    # the "no author for entry" check. That check relies on GetTripPersons(),
    # which is a db query, so the person data must be available in the object
    # store first (or the check must be designed differently). Until then the
    # persontrip calculation and storage are left to the django db.
|
||||
|
||||
# Parser for 2002, 2004, 2005, 2007 and 2010 onwards
|
||||
def Parseloghtmltxt(year, expedition, txt):
|
||||
#print(" - Starting log html parser")
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
||||
logbook_entry_count = 0
|
||||
for trippara in tripparas:
|
||||
#print(" - HR detected - maybe a trip?")
|
||||
logbook_entry_count += 1
|
||||
|
||||
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
|
||||
@ -186,7 +231,10 @@ def Parseloghtmltxt(year, expedition, txt):
|
||||
''', trippara)
|
||||
if not s:
|
||||
if not re.search(r"Rigging Guide", trippara):
|
||||
print(("can't parse: ", trippara)) # this is 2007 which needs editing
|
||||
msg = " !- can't parse: {}".format(trippara) # this is 2007 which needs editing
|
||||
print(msg)
|
||||
DataIssue.objects.create(parser='logbooks', message=msg)
|
||||
logdataissues[tid]=msg
|
||||
continue
|
||||
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
@ -198,14 +246,28 @@ def Parseloghtmltxt(year, expedition, txt):
|
||||
ltriptext = re.sub(r"</p>", "", triptext)
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
|
||||
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext,
|
||||
trippeople, expedition, tu, "html", tripid1)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
|
||||
trippeople=trippeople, expedition=expedition, logtime_underground=0,
|
||||
entry_type="html")
|
||||
|
||||
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
|
||||
"html", tripid1, logbook_entry_count)
|
||||
|
||||
# Main parser for 1991-2001. Simpler than the other parsers because the source data has been heavily edited to fit it.
|
||||
def Parseloghtml01(year, expedition, txt):
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
|
||||
logbook_entry_count = 0
|
||||
for trippara in tripparas:
|
||||
logbook_entry_count += 1
|
||||
|
||||
s = re.match("(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
|
||||
assert s, trippara[:300]
|
||||
tripheader, triptext = s.group(1), s.group(2)
|
||||
@ -238,14 +300,27 @@ def Parseloghtml01(year, expedition, txt):
|
||||
ltriptext = re.sub(r"</?i>", "''", ltriptext)
|
||||
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
|
||||
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext,
|
||||
trippeople, expedition, tu, "html01", tripid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
|
||||
trippeople=trippeople, expedition=expedition, logtime_underground=0,
|
||||
entry_type="html")
|
||||
|
||||
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
|
||||
"html01", tripid, logbook_entry_count)
|
||||
|
||||
# parser for 2003
|
||||
def Parseloghtml03(year, expedition, txt):
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
||||
logbook_entry_count = 0
|
||||
for trippara in tripparas:
|
||||
logbook_entry_count += 1
|
||||
|
||||
s = re.match("(?s)\s*<p>(.*?)</p>(.*)$", trippara)
|
||||
assert s, trippara
|
||||
tripheader, triptext = s.group(1), s.group(2)
|
||||
@ -256,7 +331,7 @@ def Parseloghtml03(year, expedition, txt):
|
||||
if re.match("T/U|Time underwater", sheader[-1]):
|
||||
tu = sheader.pop()
|
||||
if len(sheader) != 3:
|
||||
print(("header not three pieces", sheader))
|
||||
print((" ! Header not three pieces", sheader))
|
||||
tripdate, triptitle, trippeople = sheader
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
triptitles = triptitle.split(" , ")
|
||||
@ -268,10 +343,20 @@ def Parseloghtml03(year, expedition, txt):
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
|
||||
ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
|
||||
|
||||
tid= "n{}-s{:02d}".format(str(ldate),logbook_entry_count)
|
||||
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext,
|
||||
trippeople, expedition, tu, "html03", tid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
|
||||
text = ltriptext, trippeople=trippeople, expedition=expedition,
|
||||
logtime_underground=0, entry_type="html")
|
||||
|
||||
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
|
||||
"html03", tid, logbook_entry_count)
|
||||
|
||||
|
||||
def SetDatesFromLogbookEntries(expedition):
|
||||
"""
|
||||
@ -292,15 +377,30 @@ def SetDatesFromLogbookEntries(expedition):
|
||||
persontrip.save()
|
||||
|
||||
|
||||
def LoadLogbookForExpedition(expedition,numentries):
|
||||
def LoadLogbookForExpedition(expedition,expect):
|
||||
""" Parses all logbook entries for one expedition
|
||||
If a cache is found it uses it. If not found, or fails sanity checks, parses source file.
|
||||
"""
|
||||
# absolutely horrid. REFACTOR THIS (all my fault..)
|
||||
global logentries
|
||||
logbook_parseable = False
|
||||
logbook_cached = False
|
||||
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
|
||||
expologbase = os.path.join(settings.EXPOWEB, "years")
|
||||
|
||||
logentries=[]
|
||||
|
||||
def validcache(year,n):
|
||||
if year != expedition:
|
||||
print(" ! year != expedition ",year, expedition )
|
||||
return False
|
||||
if len(logentries) != n:
|
||||
print(" ! len(logentries) != n ",len(logentries), n )
|
||||
return False
|
||||
if n != expect:
|
||||
print(" ! n != expect ",n, expect )
|
||||
return False
|
||||
return True
|
||||
|
||||
if expedition.year in yearlinks:
|
||||
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
|
||||
parsefunc = yearlinks[expedition.year][1]
|
||||
@ -326,11 +426,15 @@ def LoadLogbookForExpedition(expedition,numentries):
|
||||
print(" - Reading cache: " + cache_filename, end='')
|
||||
try:
|
||||
with open(cache_filename, "rb") as f:
|
||||
logentries = pickle.load(f)
|
||||
print(" -- Loaded ", len(logentries), " log entries")
|
||||
logbook_cached = True
|
||||
year,n,logentries = pickle.load(f)
|
||||
if validcache(year,n):
|
||||
print(" -- Loaded ", len(logentries), " log entries")
|
||||
logbook_cached = True
|
||||
else:
|
||||
print(" !- Should be ", expect, " but ", len(logentries), " found in cache")
|
||||
raise
|
||||
except:
|
||||
print("\n ! Failed to load corrupt cache. Deleting it.\n")
|
||||
print(" ! Failed to load corrupt cache. Deleting it.")
|
||||
os.remove(cache_filename)
|
||||
logentries=[]
|
||||
raise
|
||||
@ -350,50 +454,75 @@ def LoadLogbookForExpedition(expedition,numentries):
|
||||
parser = globals()[parsefunc]
|
||||
parser(expedition.year, expedition, txt)
|
||||
SetDatesFromLogbookEntries(expedition)
|
||||
# and this has also stored all the log entries in logentries[]
|
||||
if len(logentries) >0:
|
||||
print(" - Cacheing " , len(logentries), " log entries")
|
||||
with open(cache_filename, "wb") as fc:
|
||||
pickle.dump(logentries, fc, 2)
|
||||
logbk=(expedition,len(logentries),logentries)
|
||||
pickle.dump(logbk, fc, protocol=4)
|
||||
else:
|
||||
print(" ! NO TRIP entries found in logbook, check the syntax.")
|
||||
|
||||
if logbook_cached:
|
||||
if logbook_cached: # working on this bit...
|
||||
i=0
|
||||
for entrytuple in range(len(logentries)):
|
||||
date, place, title, text, trippeople, expedition, logtime_underground, \
|
||||
entry_type = logentries[i]
|
||||
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
|
||||
entry_type)
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = logentries[i]
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
|
||||
entry_type)
|
||||
EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground,
|
||||
entry_type, tripid1, i)
|
||||
i +=1
|
||||
SetDatesFromLogbookEntries(expedition)
|
||||
return len(logentries)
|
||||
|
||||
def LoadLogbooks():
    """Master function for parsing all logbooks into the Troggle database.

    Steps, in order:
      1. Reset ``logdataissues`` and delete every ``DataIssue`` whose parser
         is 'logbooks'.
      2. For each ``Expedition`` whose year is not listed in ``nologbook``,
         call ``LoadLogbookForExpedition`` with the expected number of
         entries, and write the returned and expected counts to a freshly
         created ``loadlogbk.log``.
      3. Print the collected data issues and a per-year count of the entries
         held in the module-level ``trips`` store.
      4. Copy every entry of ``trips`` into the shelve file
         ``logbktrips.shelve``.

    Returns nothing.
    """
    global logdataissues

    logdataissues = {}
    # NOTE(review): 'trips' is not reset here, unlike 'logdataissues'; a second
    # call in the same process would presumably report every id as a duplicate.
    DataIssue.objects.filter(parser='logbooks').delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        # Warning only: the loop below still runs over whatever was found.
        print(" ! No expeditions found. Load 'people' first.\n")

    # Years for which no logbook is loaded.
    nologbook = {
        "1976", "1977", "1978", "1979", "1980", "1981", "1983", "1984",
        "1985", "1986", "1987", "1988", "1989", "1990",
    }
    # Expected number of logbook entries per expedition year.
    entries = {
        "2020": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
        "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
        "2008": 49, "2007": 111, "2006": 24, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
        "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
        "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1982": 0,
    }

    nlbe = {}  # year -> count returned by LoadLogbookForExpedition
    expd = {}  # year -> number of entries for that year found in 'trips'

    # Mode "w" truncates any previous log, replacing the remove-then-append dance.
    with open("loadlogbk.log", "w") as log:
        for expo in expos:
            if expo.year in nologbook:
                continue
            print(" - Logbook for: " + expo.year)
            if expo.year not in entries:
                # A year missing from the table must not abort the whole import.
                print(" ! No expected entry count for {}, assuming 0".format(expo.year))
            expected = entries.get(expo.year, 0)
            numentries = LoadLogbookForExpedition(expo, expected)
            log.write("{} {:5d} should be {}\n".format(expo.year, numentries, expected))
            nlbe[expo.year] = numentries
            expd[expo.year] = 0

    print("** total trips in ObjStore:", len(trips))
    for issue_id, issue_msg in logdataissues.items():
        print("{:15s}: {}".format(issue_id, issue_msg))

    # The year is the first element of each stored trip tuple.
    for trip in trips.values():
        trip_year = trip[0]
        expd[trip_year] = expd.get(trip_year, 0) + 1

    yt = 0
    for y in expd:
        print("{} {}".format(y, expd[y]), nlbe.get(y))
        yt += expd[y]
    print("{} total".format(yt))

    # Leaving the 'with' block closes the shelf, and closing also syncs it,
    # so no explicit sync()/close() calls are needed.
    with shelve.open('logbktrips.shelve', writeback=True) as odb:
        for lbe in trips:
            odb[lbe] = trips[lbe]
|
||||
|
||||
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
||||
expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
||||
|
Loading…
Reference in New Issue
Block a user