2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-04-03 09:21:48 +01:00

Fixing wiki-parsing for 2009 logbook

This commit is contained in:
Philip Sargent 2022-12-18 19:33:56 +00:00
parent 73b710d53f
commit d1b94763b4
5 changed files with 74 additions and 65 deletions

View File

@ -453,10 +453,6 @@ class Entrance(TroggleModel):
class LogbookEntry(TroggleModel): class LogbookEntry(TroggleModel):
"""Single parsed entry from Logbook """Single parsed entry from Logbook
But what is all this __getattribute__ meta stuff for ? When is it needed ?!?
Let's get rid of it and set the 'cave' attribute to a cave object elsewhere. This is
attempting to be Too Clever.
""" """
date = models.DateField()#MJG wants to turn this into a datetime such that multiple Logbook entries on the same day can be ordered.ld() date = models.DateField()#MJG wants to turn this into a datetime such that multiple Logbook entries on the same day can be ordered.ld()
expeditionday = models.ForeignKey("ExpeditionDay", null=True,on_delete=models.SET_NULL)#MJG wants to KILL THIS (redundant information) expeditionday = models.ForeignKey("ExpeditionDay", null=True,on_delete=models.SET_NULL)#MJG wants to KILL THIS (redundant information)
@ -466,13 +462,14 @@ class LogbookEntry(TroggleModel):
place = models.CharField(max_length=100,blank=True, null=True,help_text="Only use this if you haven't chosen a cave") place = models.CharField(max_length=100,blank=True, null=True,help_text="Only use this if you haven't chosen a cave")
text = models.TextField() text = models.TextField()
slug = models.SlugField(max_length=50) slug = models.SlugField(max_length=50)
time_underground = models.FloatField(null=True,help_text="In decimal hours")
class Meta: class Meta:
verbose_name_plural = "Logbook Entries" verbose_name_plural = "Logbook Entries"
# several PersonTrips point in to this object # several PersonTrips point in to this object
ordering = ('-date',) ordering = ('-date',)
def cave(self): # Why didn't he just make this a foreign key to Cave ? Replaces __getattrribute__ sillyness. def cave(self): # Why didn't he just make this a foreign key to Cave ?
c = CaveSlug.objects.get(slug=self.cave_slug, primary=True).cave c = CaveSlug.objects.get(slug=self.cave_slug, primary=True).cave
return c return c
@ -491,18 +488,6 @@ class LogbookEntry(TroggleModel):
def get_previous_by_id(self): def get_previous_by_id(self):
LogbookEntry.objects.get(id=self.id-1) LogbookEntry.objects.get(id=self.id-1)
# def new_QM_number(self):
# """Returns """
# if self.cave:
# nextQMnumber=self.cave.new_QM_number(self.date.year)
# else:
# return None
# return nextQMnumber
# def new_QM_found_link(self):
# """Produces a link to a new QM with the next number filled in and this LogbookEntry set as 'found by' """
# return settings.URL_ROOT + r'/admin/core/qm/add/?' + r'found_by=' + str(self.pk) +'&number=' + str(self.new_QM_number())
def DayIndex(self): def DayIndex(self):
return list(self.expeditionday.logbookentry_set.all()).index(self) return list(self.expeditionday.logbookentry_set.all()).index(self)

View File

@ -41,11 +41,11 @@ def import_logbooks():
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks() troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2019): def import_logbook(year=2009):
print(f"-- Importing Logbook {year}") print(f"-- Importing Logbook {year}")
print(f"-- - commented out") print(f"-- - commented out")
# with transaction.atomic(): with transaction.atomic():
# troggle.parsers.logbooks.LoadLogbook(year, format="cucc") troggle.parsers.logbooks.LoadLogbook(year)
def import_QMs(): def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files") print("-- Importing old QMs for 161, 204, 234 from CSV files")

View File

@ -52,7 +52,7 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
''' '''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = { BLOG_PARSER_SETTINGS = {
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
"2018": ("ukcavingblog.html", "parser_blog"), "2018": ("ukcavingblog.html", "parser_blog"),
"2019": ("ukcavingblog.html", "parser_blog"), "2019": ("ukcavingblog.html", "parser_blog"),
"2022": ("ukcavingblog.html", "parser_blog"), "2022": ("ukcavingblog.html", "parser_blog"),
@ -60,12 +60,13 @@ BLOG_PARSER_SETTINGS = {
DEFAULT_LOGBOOK_FILE = "logbook.html" DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html" DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years since 2010 use the default value for Logbook parser # All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = { LOGBOOK_PARSER_SETTINGS = {
"2019": ("logbook.html", "parser_html"), "2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"), "2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"), # "2009": ("2009logbook.txt", "wiki_parser"), # converted to html
"2008": ("2008logbook.txt", "wiki_parser"), # "2008": ("2008logbook.txt", "wiki_parser"), # converted to html
"2009": ("logbook.html", "parser_html"),
"2008": ("logbook.html", "parser_html"),
"2007": ("logbook.html", "parser_html"), "2007": ("logbook.html", "parser_html"),
"2006": ("logbook.html", "parser_html"), "2006": ("logbook.html", "parser_html"),
# "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html # "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html
@ -96,15 +97,15 @@ LOGBOOK_PARSER_SETTINGS = {
} }
entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79, entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1, "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,} "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
logentries = [] # the entire logbook for one year is a single object: a list of entries logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau', noncaveplaces = [ "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ] 'base camp', 'basecamp', 'top camp', 'topcamp' ]
logdataissues = TROG['issues']['logdataissues'] logdataissues = TROG['issues']['logdataissues']
trips ={} trips ={}
@ -170,11 +171,30 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
""" saves a logbook entry and related persontrips """ saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
""" """
# Nasty hack, must tidy this up..
if logtime_underground:
try:
logtime_underground = float(logtime_underground)
except:
# print(f"logtime_underground = {logtime_underground}")
tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
if tu_match:
# print(f"logtime_underground = {tu_match.group(2)}")
logtime_underground = float(tu_match.group(2))
else:
logtime_underground = 0
else:
logtime_underground = 0
try: try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
except: except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL" message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message) DataIssue.objects.create(parser='logbooks', message=message)
@ -223,11 +243,13 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
# slug = tid + "_" + slugify(title)[:10].replace('-','_') # slug = tid + "_" + slugify(title)[:10].replace('-','_')
else: else:
slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_') slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug} nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition,
'time_underground':logtime_underground, 'cave_slug':str(cave), 'slug': slug}
# This creates the lbo instance of LogbookEntry # This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons: for tripperson, time_underground in trippersons:
# print(f" - {tid} '{tripperson}' author:{tripperson == author}") # print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo} lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
@ -300,24 +322,29 @@ def wiki_parser(year, expedition, txt, seq=""):
else: else:
tripsplace = tripsplace[1] tripsplace = tripsplace[1]
#print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} '{tripsplace}'")
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) #tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
tul = re.findall(r"T/U:?\s*(\d+[.]?\d*)\s*(hr|hrs|hours)?.*", triptext)
if tul: if tul:
tu = tul[0][0] tu = tul[0][0]
else: else:
tu = "" tu = ""
print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} T/U:{tu} '{tripcave} - {tripsplace}' ")
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
tripid ="" tripid = set_trip_id(year,logbook_entry_count)
entrytuple = (ldate, tripcave, tripsplace, triptext, ltriptext = re.sub(r"\n", "<br /><br />\n", triptext)
ltriptext = ltriptext.replace("<br /><br />\n<br /><br />\n","<br /><br />\n")
triptitle = f'{tripcave} - {tripsplace}'
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, tripid) trippeople, expedition, tu, tripid)
logentries.append(entrytuple) logentries.append(entrytuple)
# 2002, 2004, 2005, 2007, 2010 - now # 2002, 2004 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
def parser_html(year, expedition, txt, seq=""): def parser_html(year, expedition, txt, seq=""):
global logentries global logentries
global logdataissues global logdataissues
@ -382,7 +409,7 @@ def parser_html(year, expedition, txt, seq=""):
else: else:
tripcave = "UNKNOWN" tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip() ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext, entrytuple = (ldate, tripcave, triptitle, ltriptext,
@ -665,15 +692,17 @@ def LoadLogbookForExpedition(expedition, clean=True):
print(f' - {year} parsing with {parsefunc} - {lb}') print(f' - {year} parsing with {parsefunc} - {lb}')
parser(year, expedition, txt, sq) # this launches the right parser for this year parser(year, expedition, txt, sq) # this launches the right parser for this year
# -------------------- # --------------------
dupl = {}
for entrytuple in logentries: for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
try: check = (date, triptitle)
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple if check in dupl:
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022. dupl[check] += 1
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple triptitle = f"{triptitle} #{dupl[check]}"
print(f' - Exception entry_type "{entry_type}" {tripid1}') print(f' - {triptitle}')
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0, else:
dupl[check] = 1
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground,
tripid1) tripid1)
if len(logentries) == expect: if len(logentries) == expect:
@ -684,19 +713,16 @@ def LoadLogbookForExpedition(expedition, clean=True):
return len(logentries) return len(logentries)
# def LoadLogbook(year, format="cucc"): def LoadLogbook(year):
# global LOGBOOK_PARSER_SETTINGS '''One off logbook for testing purposes
'''
global LOGBOOK_PARSER_SETTINGS
# nlbe={} nlbe={}
# TROG['pagecache']['expedition'][year] = None # clear cache TROG['pagecache']['expedition'][year] = None # clear cache
# expo = Expedition.objects.get(year=year) expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
# if (format=="blog"):
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks(): def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """ This is the master function for parsing all logbooks into the Troggle database.
@ -721,8 +747,6 @@ def LoadLogbooks():
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first] sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
nologbook = noexpo + lostlogbook + sqlfail nologbook = noexpo + lostlogbook + sqlfail
# blogs = ["2019"]
nlbe={} nlbe={}
expd ={} expd ={}
loglist = [] loglist = []

View File

@ -68,10 +68,10 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
<form action="" method="GET"><input type="submit" name="reload" value="Reload from logbook"></form> <form action="" method="GET"><input type="submit" name="reload" value="Reload from logbook"></form>
{% endif %} {% endif %}
<h3 id="trips">Logbooks and survey trips per day</h3> <h3 id="trips"> {{expedition.name}} - Records per day</h3>
<table class="expeditionlogbooks"> <table class="expeditionlogbooks">
<tr><th>Date</th><th>Logged trips</th><th>Surveys</th><th>Wallets</th></tr> <tr><th>Date</th><th>Logged trips and diary entries</th><th>Surveys</th><th>Wallets</th></tr>
{% regroup dateditems|dictsort:"date" by date as dates %} {% regroup dateditems|dictsort:"date" by date as dates %}
{% for date in dates %} {% for date in dates %}
<tr> <tr>
@ -89,5 +89,5 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
{% endfor %}</td></tr> {% endfor %}</td></tr>
{% endfor %} {% endfor %}
</table> </table>
<h3> {{expedition.name}} </h3>
{% endblock %} {% endblock %}

View File

@ -20,11 +20,11 @@ Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook()
<hr /> <hr />
<div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div> <div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div>
<div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person}}</u>{% else %}{{ persontrip.personexpedition.person }}{% endif %}, {% endfor %}</div> <div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person|safe}}</u>{% else %}{{ persontrip.personexpedition.person|safe }}{% endif %}, {% endfor %}</div>
<div class="triptitle">{{logbook_entry.title}}</div> <div class="triptitle">{{logbook_entry.title|safe}}</div>
{{logbook_entry.text|safe}} {{logbook_entry.text|safe}}
<div class="timeug">T/U: {{logbook_entry.time_underground}}</div> <div class="timeug">T/U: {{logbook_entry.time_underground|safe}} hours</div>
{% endfor %} {% endfor %}
<hr /> <hr />
</body> </body>