mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-12-13 20:17:05 +00:00
Importing old logbooks
This commit is contained in:
@@ -37,9 +37,10 @@ def GetTripPersons(trippeople, expedition, logtime_underground):
|
||||
tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
|
||||
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
|
||||
if not personyear:
|
||||
print((" - No name match for: '%s'" % tripperson))
|
||||
message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year)
|
||||
message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year)
|
||||
print(message)
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
logdataissues[expedition.year + "~" + tripperson]=message
|
||||
res.append((personyear, logtime_underground))
|
||||
if mul:
|
||||
author = personyear
|
||||
@@ -91,6 +92,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
||||
print(" ! - Skipping logentry: " + title + " - no author for entry")
|
||||
message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
logdataissues["title"]=message
|
||||
return
|
||||
|
||||
# This needs attention. The slug field is derived from 'title'
|
||||
@@ -133,7 +135,7 @@ def ParseDate(tripdate, year):
|
||||
else:
|
||||
message = " ! - Bad date in logbook: " + tripdate + " - " + year
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
logdataissues["author"]=message
|
||||
logdataissues["tripdate"]=message
|
||||
assert False, tripdate
|
||||
|
||||
return datetime.date(year, month, day)
|
||||
@@ -254,57 +256,77 @@ def Parseloghtmltxt(year, expedition, txt):
|
||||
"html", tripid1, logbook_entry_count)
|
||||
|
||||
# Main parser for 1991 - 2001. Simpler because the data has been hacked so much to fit it.
|
||||
# Also being tried out for years 1982 - 1990. Some hand-editing of those logbooks is required.
|
||||
def Parseloghtml01(year, expedition, txt):
    """Parse an ``<hr>``-separated HTML logbook and record each trip entry.

    ``txt`` is cut into paragraphs, each one being the text between an
    ``<hr>`` tag and the next ``<hr``.  The first ``<p>`` of a paragraph is
    the header, expected to read ``date | title | people``; the remainder is
    the trip text, from which an optional ``T/U`` paragraph is lifted out.

    For every entry that parses, the function appends a tuple to the global
    ``logentries`` and calls ``EnterLogIntoDbase`` and
    ``EnterLogIntoObjStore``.  Problems are reported by creating a
    ``DataIssue`` record, storing the message in the global
    ``logdataissues`` (keyed by ``"<year>.<entry number>"``) and printing it.

    Args:
        year: Logbook year; concatenated with strings, so it must be a str.
        expedition: Expedition object, passed through unchanged.
        txt: Full HTML text of the logbook.

    Returns:
        None.  Returns early once more than five entries have raised.
    """
    global logentries
    global logdataissues
    errorcount = 0

    # The flags are passed explicitly.  The pattern used to carry an inline
    # "(?i)" at its *end*, which is deprecated since Python 3.6 and rejected
    # outright by Python 3.11+.  Compiled once because it is loop-invariant.
    header_re = re.compile(r"\s*(?:<p>)?(.*?)</?p>(.*)$", re.DOTALL | re.IGNORECASE)

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        # Bound before the try so the exception handler can always name the entry.
        tripentry = year + "." + str(logbook_entry_count)
        try:
            s = header_re.match(trippara)
            if not s:
                message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tripentry] = message
                print(message)
                # NOTE(review): the message says "Skipping", but break abandons
                # every remaining entry in this logbook - confirm whether
                # 'continue' was intended.
                break
            tripheader, triptext = s.group(1), s.group(2)
            mtripid = re.search(r'<a id="(.*?)"', tripheader)
            tripid = mtripid.group(1) if mtripid else ""
            # Strip <a>, <b> and <span> tags so only the header text remains.
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

            # Raises ValueError unless the header has exactly three fields;
            # the handler below reports that as a skipped entry.
            tripdate, triptitle, trippeople = tripheader.split("|")
            ldate = ParseDate(tripdate.strip(), year)

            # Lift the "T/U ..." paragraph, if any, out of the trip text.
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            triptitles = triptitle.split(" - ")
            tripcave = triptitles[0].strip()

            ltriptext = triptext

            # Drop a trailing run of links, separators and "(same day)" / "(N)" markers.
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            # Convert the remaining HTML into the plain-text markup that gets stored.
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, "html01", tripid)
            logentries.append(entrytuple)

            EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                              trippeople=trippeople, expedition=expedition, logtime_underground=0,
                              entry_type="html")

            EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                                 "html01", tripid, logbook_entry_count)
        except Exception:
            # Was a bare "except:", which also trapped KeyboardInterrupt and
            # SystemExit and so made a long import impossible to interrupt.
            message = " ! - Skipping logentry due to exception in: " + tripentry
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tripentry] = message
            print(message)
            errorcount += 1
            if errorcount > 5:
                message = " !!- TOO MANY ERRORS - aborting logbook: " + year
                DataIssue.objects.create(parser='logbooks', message=message)
                # Same key as above, so this replaces the per-entry message
                # in logdataissues.
                logdataissues[tripentry] = message
                print(message)
                return
|
||||
|
||||
# parser for 2003
|
||||
def Parseloghtml03(year, expedition, txt):
|
||||
@@ -473,6 +495,8 @@ def LoadLogbookForExpedition(expedition,expect):
|
||||
|
||||
def LoadLogbooks():
|
||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||
Parser settings appropriate for each year are set in settings.py LOGBOOK_PARSER_SETTINGS.
|
||||
This should be rewritten to use coroutines to load all logbooks from disc in parallel.
|
||||
"""
|
||||
global logdataissues
|
||||
|
||||
@@ -481,13 +505,14 @@ def LoadLogbooks():
|
||||
expos = Expedition.objects.all()
|
||||
if len(expos) <= 1:
|
||||
print(" ! No expeditions found. Load 'people' first.\n")
|
||||
nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
|
||||
"1985","1986","1987","1988","1989","1990",]
|
||||
entries = {"2020": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
|
||||
nologbook = ["1976", "1977", "1978", "1979", "1980", "1981", "1986", "2020",]
|
||||
entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
|
||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
|
||||
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
|
||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
|
||||
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1982": 0}
|
||||
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
|
||||
"1985": 1,"1984": 1,"1983": 1,"1982": 42,}
|
||||
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
|
||||
try:
|
||||
os.remove("loadlogbk.log")
|
||||
except OSError:
|
||||
@@ -503,8 +528,8 @@ def LoadLogbooks():
|
||||
nlbe[expo.year]=numentries
|
||||
expd[expo.year]= 0
|
||||
print("** total trips in ObjStore:", len(trips))
|
||||
for i in logdataissues:
|
||||
print("{:15s}: {}".format(i, logdataissues[i]))
|
||||
#for i in logdataissues:
|
||||
# print("{:15s}: {}".format(i, logdataissues[i]))
|
||||
|
||||
for lbe in trips:
|
||||
year, date, tripcave, triptitle, text, trippeople, tu, formattype = trips[lbe]
|
||||
@@ -513,7 +538,7 @@ def LoadLogbooks():
|
||||
for y in expd:
|
||||
print("{} {}".format(y, expd[y]), nlbe[y])
|
||||
yt += expd[y]
|
||||
print("{} total".format(yt))
|
||||
print("total {} log entries in all expeditions".format(yt))
|
||||
|
||||
with shelve.open('logbktrips.shelve',writeback=True) as odb:
|
||||
for lbe in trips:
|
||||
|
||||
Reference in New Issue
Block a user