2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-22 07:11:52 +00:00

Fixing wiki-parsing for 2009 logbook

This commit is contained in:
Philip Sargent 2022-12-18 19:33:56 +00:00
parent 73b710d53f
commit d1b94763b4
5 changed files with 74 additions and 65 deletions

View File

@ -453,10 +453,6 @@ class Entrance(TroggleModel):
class LogbookEntry(TroggleModel):
"""Single parsed entry from Logbook
But what is all this __getattribute__ meta stuff for ? When is it needed ?!?
Let's get rid of it and set the 'cave' attribute to a cave object elsewhere. This is
attempting to be Too Clever.
"""
date = models.DateField()#MJG wants to turn this into a datetime such that multiple Logbook entries on the same day can be ordered.ld()
expeditionday = models.ForeignKey("ExpeditionDay", null=True,on_delete=models.SET_NULL)#MJG wants to KILL THIS (redundant information)
@ -466,13 +462,14 @@ class LogbookEntry(TroggleModel):
place = models.CharField(max_length=100,blank=True, null=True,help_text="Only use this if you haven't chosen a cave")
text = models.TextField()
slug = models.SlugField(max_length=50)
time_underground = models.FloatField(null=True,help_text="In decimal hours")
class Meta:
verbose_name_plural = "Logbook Entries"
# several PersonTrips point in to this object
ordering = ('-date',)
def cave(self): # Why didn't he just make this a foreign key to Cave ? Replaces __getattribute__ silliness.
def cave(self): # Why didn't he just make this a foreign key to Cave ?
c = CaveSlug.objects.get(slug=self.cave_slug, primary=True).cave
return c
@ -491,18 +488,6 @@ class LogbookEntry(TroggleModel):
def get_previous_by_id(self):
LogbookEntry.objects.get(id=self.id-1)
# def new_QM_number(self):
# """Returns """
# if self.cave:
# nextQMnumber=self.cave.new_QM_number(self.date.year)
# else:
# return None
# return nextQMnumber
# def new_QM_found_link(self):
# """Produces a link to a new QM with the next number filled in and this LogbookEntry set as 'found by' """
# return settings.URL_ROOT + r'/admin/core/qm/add/?' + r'found_by=' + str(self.pk) +'&number=' + str(self.new_QM_number())
def DayIndex(self):
return list(self.expeditionday.logbookentry_set.all()).index(self)

View File

@ -41,11 +41,11 @@ def import_logbooks():
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2019):
def import_logbook(year=2009):
print(f"-- Importing Logbook {year}")
print(f"-- - commented out")
# with transaction.atomic():
# troggle.parsers.logbooks.LoadLogbook(year, format="cucc")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files")

View File

@ -52,7 +52,7 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
'''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = {
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
"2018": ("ukcavingblog.html", "parser_blog"),
"2019": ("ukcavingblog.html", "parser_blog"),
"2022": ("ukcavingblog.html", "parser_blog"),
@ -60,12 +60,13 @@ BLOG_PARSER_SETTINGS = {
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
"2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
# "2009": ("2009logbook.txt", "wiki_parser"), # converted to html
# "2008": ("2008logbook.txt", "wiki_parser"), # converted to html
"2009": ("logbook.html", "parser_html"),
"2008": ("logbook.html", "parser_html"),
"2007": ("logbook.html", "parser_html"),
"2006": ("logbook.html", "parser_html"),
# "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html
@ -96,15 +97,15 @@ LOGBOOK_PARSER_SETTINGS = {
}
entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
noncaveplaces = [ "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
logdataissues = TROG['issues']['logdataissues']
trips ={}
@ -170,11 +171,30 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite
but we are saving the same thing too many times..
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# Nasty hack, must tidy this up..
if logtime_underground:
try:
logtime_underground = float(logtime_underground)
except:
# print(f"logtime_underground = {logtime_underground}")
tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
if tu_match:
# print(f"logtime_underground = {tu_match.group(2)}")
logtime_underground = float(tu_match.group(2))
else:
logtime_underground = 0
else:
logtime_underground = 0
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message)
@ -223,11 +243,13 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
else:
slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition,
'time_underground':logtime_underground, 'cave_slug':str(cave), 'slug': slug}
# This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons:
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
@ -300,24 +322,29 @@ def wiki_parser(year, expedition, txt, seq=""):
else:
tripsplace = tripsplace[1]
#print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} '{tripsplace}'")
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
#tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
tul = re.findall(r"T/U:?\s*(\d+[.]?\d*)\s*(hr|hrs|hours)?.*", triptext)
if tul:
tu = tul[0][0]
else:
tu = ""
print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} T/U:{tu} '{tripcave} - {tripsplace}' ")
ldate = ParseDate(tripdate.strip(), year)
tripid =""
entrytuple = (ldate, tripcave, tripsplace, triptext,
tripid = set_trip_id(year,logbook_entry_count)
ltriptext = re.sub(r"\n", "<br /><br />\n", triptext)
ltriptext = ltriptext.replace("<br /><br />\n<br /><br />\n","<br /><br />\n")
triptitle = f'{tripcave} - {tripsplace}'
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, tripid)
logentries.append(entrytuple)
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
# 2002, 2004 - now
def parser_html(year, expedition, txt, seq=""):
global logentries
global logdataissues
@ -382,7 +409,7 @@ def parser_html(year, expedition, txt, seq=""):
else:
tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
#ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext,
@ -665,15 +692,17 @@ def LoadLogbookForExpedition(expedition, clean=True):
print(f' - {year} parsing with {parsefunc} - {lb}')
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
dupl = {}
for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
try:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
print(f' - Exception entry_type "{entry_type}" {tripid1}')
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
check = (date, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f' - {triptitle}')
else:
dupl[check] = 1
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground,
tripid1)
if len(logentries) == expect:
@ -684,19 +713,16 @@ def LoadLogbookForExpedition(expedition, clean=True):
return len(logentries)
# def LoadLogbook(year, format="cucc"):
# global LOGBOOK_PARSER_SETTINGS
def LoadLogbook(year):
'''One off logbook for testing purposes
'''
global LOGBOOK_PARSER_SETTINGS
# nlbe={}
# TROG['pagecache']['expedition'][year] = None # clear cache
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
# expo = Expedition.objects.get(year=year)
# if (format=="blog"):
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
@ -721,8 +747,6 @@ def LoadLogbooks():
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
nologbook = noexpo + lostlogbook + sqlfail
# blogs = ["2019"]
nlbe={}
expd ={}
loglist = []

View File

@ -68,10 +68,10 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
<form action="" method="GET"><input type="submit" name="reload" value="Reload from logbook"></form>
{% endif %}
<h3 id="trips">Logbooks and survey trips per day</h3>
<h3 id="trips"> {{expedition.name}} - Records per day</h3>
<table class="expeditionlogbooks">
<tr><th>Date</th><th>Logged trips</th><th>Surveys</th><th>Wallets</th></tr>
<tr><th>Date</th><th>Logged trips and diary entries</th><th>Surveys</th><th>Wallets</th></tr>
{% regroup dateditems|dictsort:"date" by date as dates %}
{% for date in dates %}
<tr>
@ -89,5 +89,5 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
{% endfor %}</td></tr>
{% endfor %}
</table>
<h3> {{expedition.name}} </h3>
{% endblock %}

View File

@ -20,11 +20,11 @@ Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook()
<hr />
<div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div>
<div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person}}</u>{% else %}{{ persontrip.personexpedition.person }}{% endif %}, {% endfor %}</div>
<div class="triptitle">{{logbook_entry.title}}</div>
<div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person|safe}}</u>{% else %}{{ persontrip.personexpedition.person|safe }}{% endif %}, {% endfor %}</div>
<div class="triptitle">{{logbook_entry.title|safe}}</div>
{{logbook_entry.text|safe}}
<div class="timeug">T/U: {{logbook_entry.time_underground}}</div>
<div class="timeug">T/U: {{logbook_entry.time_underground|safe}} hours</div>
{% endfor %}
<hr />
</body>