mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-22 07:11:52 +00:00
Fixing wiki-parsing for 2009 logbook
This commit is contained in:
parent
73b710d53f
commit
d1b94763b4
@ -453,10 +453,6 @@ class Entrance(TroggleModel):
|
||||
|
||||
class LogbookEntry(TroggleModel):
|
||||
"""Single parsed entry from Logbook
|
||||
|
||||
But what is all this __getattribute__ meta stuff for? When is it needed?!?
|
||||
Let's get rid of it and set the 'cave' attribute to a cave object elsewhere. This is
|
||||
attempting to be Too Clever.
|
||||
"""
|
||||
date = models.DateField()#MJG wants to turn this into a datetime such that multiple Logbook entries on the same day can be ordered.ld()
|
||||
expeditionday = models.ForeignKey("ExpeditionDay", null=True,on_delete=models.SET_NULL)#MJG wants to KILL THIS (redundant information)
|
||||
@ -466,13 +462,14 @@ class LogbookEntry(TroggleModel):
|
||||
place = models.CharField(max_length=100,blank=True, null=True,help_text="Only use this if you haven't chosen a cave")
|
||||
text = models.TextField()
|
||||
slug = models.SlugField(max_length=50)
|
||||
time_underground = models.FloatField(null=True,help_text="In decimal hours")
|
||||
|
||||
class Meta:
|
||||
verbose_name_plural = "Logbook Entries"
|
||||
# several PersonTrips point in to this object
|
||||
ordering = ('-date',)
|
||||
|
||||
def cave(self): # Why didn't he just make this a foreign key to Cave ? Replaces __getattrribute__ sillyness.
|
||||
def cave(self): # Why didn't he just make this a foreign key to Cave ?
|
||||
c = CaveSlug.objects.get(slug=self.cave_slug, primary=True).cave
|
||||
return c
|
||||
|
||||
@ -491,18 +488,6 @@ class LogbookEntry(TroggleModel):
|
||||
def get_previous_by_id(self):
|
||||
LogbookEntry.objects.get(id=self.id-1)
|
||||
|
||||
# def new_QM_number(self):
|
||||
# """Returns """
|
||||
# if self.cave:
|
||||
# nextQMnumber=self.cave.new_QM_number(self.date.year)
|
||||
# else:
|
||||
# return None
|
||||
# return nextQMnumber
|
||||
|
||||
# def new_QM_found_link(self):
|
||||
# """Produces a link to a new QM with the next number filled in and this LogbookEntry set as 'found by' """
|
||||
# return settings.URL_ROOT + r'/admin/core/qm/add/?' + r'found_by=' + str(self.pk) +'&number=' + str(self.new_QM_number())
|
||||
|
||||
def DayIndex(self):
|
||||
return list(self.expeditionday.logbookentry_set.all()).index(self)
|
||||
|
||||
|
@ -41,11 +41,11 @@ def import_logbooks():
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbooks()
|
||||
|
||||
def import_logbook(year=2019):
|
||||
def import_logbook(year=2009):
|
||||
print(f"-- Importing Logbook {year}")
|
||||
print(f"-- - commented out")
|
||||
# with transaction.atomic():
|
||||
# troggle.parsers.logbooks.LoadLogbook(year, format="cucc")
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbook(year)
|
||||
|
||||
def import_QMs():
|
||||
print("-- Importing old QMs for 161, 204, 234 from CSV files")
|
||||
|
@ -52,7 +52,7 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
|
||||
'''
|
||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||
BLOG_PARSER_SETTINGS = {
|
||||
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
"2018": ("ukcavingblog.html", "parser_blog"),
|
||||
"2019": ("ukcavingblog.html", "parser_blog"),
|
||||
"2022": ("ukcavingblog.html", "parser_blog"),
|
||||
@ -60,12 +60,13 @@ BLOG_PARSER_SETTINGS = {
|
||||
DEFAULT_LOGBOOK_FILE = "logbook.html"
|
||||
DEFAULT_LOGBOOK_PARSER = "parser_html"
|
||||
# All years since 2010 use the default value for Logbook parser
|
||||
# but several don't work, and are skipped by the parsing code, e.g. 1983
|
||||
LOGBOOK_PARSER_SETTINGS = {
|
||||
"2019": ("logbook.html", "parser_html"),
|
||||
"2010": ("logbook.html", "parser_html"),
|
||||
"2009": ("2009logbook.txt", "wiki_parser"),
|
||||
"2008": ("2008logbook.txt", "wiki_parser"),
|
||||
# "2009": ("2009logbook.txt", "wiki_parser"), # converted to html
|
||||
# "2008": ("2008logbook.txt", "wiki_parser"), # converted to html
|
||||
"2009": ("logbook.html", "parser_html"),
|
||||
"2008": ("logbook.html", "parser_html"),
|
||||
"2007": ("logbook.html", "parser_html"),
|
||||
"2006": ("logbook.html", "parser_html"),
|
||||
# "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html
|
||||
@ -96,15 +97,15 @@ LOGBOOK_PARSER_SETTINGS = {
|
||||
}
|
||||
|
||||
entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79,
|
||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
|
||||
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
|
||||
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
|
||||
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
|
||||
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
|
||||
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no fully working parser, or need hand-editing.
|
||||
|
||||
logentries = [] # the entire logbook for one year is a single object: a list of entries
|
||||
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
|
||||
noncaveplaces = [ "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
|
||||
'base camp', 'basecamp', 'top camp', 'topcamp' ]
|
||||
logdataissues = TROG['issues']['logdataissues']
|
||||
trips ={}
|
||||
@ -170,11 +171,30 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
||||
""" saves a logbook entry and related persontrips
|
||||
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
||||
|
||||
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite
|
||||
but we are saving the same thing too many times..
|
||||
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
|
||||
|
||||
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
|
||||
lookupAttribs={'date':date, 'title':title}
|
||||
"""
|
||||
|
||||
# Nasty hack, must tidy this up..
|
||||
if logtime_underground:
|
||||
try:
|
||||
logtime_underground = float(logtime_underground)
|
||||
except:
|
||||
# print(f"logtime_underground = {logtime_underground}")
|
||||
tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
|
||||
if tu_match:
|
||||
# print(f"logtime_underground = {tu_match.group(2)}")
|
||||
logtime_underground = float(tu_match.group(2))
|
||||
else:
|
||||
logtime_underground = 0
|
||||
else:
|
||||
logtime_underground = 0
|
||||
|
||||
try:
|
||||
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
|
||||
# print(f" - {author} - {logtime_underground}")
|
||||
except:
|
||||
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
@ -223,11 +243,13 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
||||
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
|
||||
else:
|
||||
slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
|
||||
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
|
||||
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition,
|
||||
'time_underground':logtime_underground, 'cave_slug':str(cave), 'slug': slug}
|
||||
|
||||
# This creates the lbo instance of LogbookEntry
|
||||
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
|
||||
|
||||
# for PersonTrip time_underground is float (decimal hours)
|
||||
for tripperson, time_underground in trippersons:
|
||||
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
|
||||
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
|
||||
@ -300,24 +322,29 @@ def wiki_parser(year, expedition, txt, seq=""):
|
||||
else:
|
||||
tripsplace = tripsplace[1]
|
||||
|
||||
#print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} '{tripsplace}'")
|
||||
|
||||
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
|
||||
#tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
|
||||
tul = re.findall(r"T/U:?\s*(\d+[.]?\d*)\s*(hr|hrs|hours)?.*", triptext)
|
||||
if tul:
|
||||
tu = tul[0][0]
|
||||
else:
|
||||
tu = ""
|
||||
print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} T/U:{tu} '{tripcave} - {tripsplace}' ")
|
||||
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
tripid =""
|
||||
|
||||
entrytuple = (ldate, tripcave, tripsplace, triptext,
|
||||
tripid = set_trip_id(year,logbook_entry_count)
|
||||
|
||||
ltriptext = re.sub(r"\n", "<br /><br />\n", triptext)
|
||||
ltriptext = ltriptext.replace("<br /><br />\n<br /><br />\n","<br /><br />\n")
|
||||
|
||||
triptitle = f'{tripcave} - {tripsplace}'
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext,
|
||||
trippeople, expedition, tu, tripid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
|
||||
|
||||
# 2002, 2004, 2005, 2007, 2010 - now
|
||||
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
|
||||
# 2002, 2004 - now
|
||||
def parser_html(year, expedition, txt, seq=""):
|
||||
global logentries
|
||||
global logdataissues
|
||||
@ -382,7 +409,7 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
else:
|
||||
tripcave = "UNKNOWN"
|
||||
ltriptext = re.sub(r"</p>", "", triptext)
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
#ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
||||
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext,
|
||||
@ -665,15 +692,17 @@ def LoadLogbookForExpedition(expedition, clean=True):
|
||||
print(f' - {year} parsing with {parsefunc} - {lb}')
|
||||
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
||||
# --------------------
|
||||
|
||||
dupl = {}
|
||||
for entrytuple in logentries:
|
||||
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
try:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
|
||||
print(f' - Exception entry_type "{entry_type}" {tripid1}')
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
check = (date, triptitle)
|
||||
if check in dupl:
|
||||
dupl[check] += 1
|
||||
triptitle = f"{triptitle} #{dupl[check]}"
|
||||
print(f' - {triptitle}')
|
||||
else:
|
||||
dupl[check] = 1
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground,
|
||||
tripid1)
|
||||
|
||||
if len(logentries) == expect:
|
||||
@ -684,19 +713,16 @@ def LoadLogbookForExpedition(expedition, clean=True):
|
||||
|
||||
return len(logentries)
|
||||
|
||||
# def LoadLogbook(year, format="cucc"):
|
||||
# global LOGBOOK_PARSER_SETTINGS
|
||||
def LoadLogbook(year):
|
||||
'''One-off logbook load for testing purposes
|
||||
'''
|
||||
global LOGBOOK_PARSER_SETTINGS
|
||||
|
||||
# nlbe={}
|
||||
# TROG['pagecache']['expedition'][year] = None # clear cache
|
||||
nlbe={}
|
||||
TROG['pagecache']['expedition'][year] = None # clear cache
|
||||
|
||||
# expo = Expedition.objects.get(year=year)
|
||||
|
||||
# if (format=="blog"):
|
||||
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
|
||||
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
|
||||
|
||||
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||
expo = Expedition.objects.get(year=year)
|
||||
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||
|
||||
def LoadLogbooks():
|
||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||
@ -721,8 +747,6 @@ def LoadLogbooks():
|
||||
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
|
||||
nologbook = noexpo + lostlogbook + sqlfail
|
||||
|
||||
# blogs = ["2019"]
|
||||
|
||||
nlbe={}
|
||||
expd ={}
|
||||
loglist = []
|
||||
|
@ -68,10 +68,10 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
|
||||
<form action="" method="GET"><input type="submit" name="reload" value="Reload from logbook"></form>
|
||||
{% endif %}
|
||||
|
||||
<h3 id="trips">Logbooks and survey trips per day</h3>
|
||||
<h3 id="trips"> {{expedition.name}} - Records per day</h3>
|
||||
|
||||
<table class="expeditionlogbooks">
|
||||
<tr><th>Date</th><th>Logged trips</th><th>Surveys</th><th>Wallets</th></tr>
|
||||
<tr><th>Date</th><th>Logged trips and diary entries</th><th>Surveys</th><th>Wallets</th></tr>
|
||||
{% regroup dateditems|dictsort:"date" by date as dates %}
|
||||
{% for date in dates %}
|
||||
<tr>
|
||||
@ -89,5 +89,5 @@ an "S" for a survey trip. The colours are the same for people on the same trip.
|
||||
{% endfor %}</td></tr>
|
||||
{% endfor %}
|
||||
</table>
|
||||
|
||||
<h3> {{expedition.name}} </h3>
|
||||
{% endblock %}
|
||||
|
@ -20,11 +20,11 @@ Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook()
|
||||
<hr />
|
||||
|
||||
<div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div>
|
||||
<div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person}}</u>{% else %}{{ persontrip.personexpedition.person }}{% endif %}, {% endfor %}</div>
|
||||
<div class="triptitle">{{logbook_entry.title}}</div>
|
||||
<div class="trippeople">{% for persontrip in logbook_entry.persontrip_set.all %}{% if persontrip.is_logbook_entry_author %}<u>{{persontrip.personexpedition.person|safe}}</u>{% else %}{{ persontrip.personexpedition.person|safe }}{% endif %}, {% endfor %}</div>
|
||||
<div class="triptitle">{{logbook_entry.title|safe}}</div>
|
||||
|
||||
{{logbook_entry.text|safe}}
|
||||
<div class="timeug">T/U: {{logbook_entry.time_underground}}</div>
|
||||
<div class="timeug">T/U: {{logbook_entry.time_underground|safe}} hours</div>
|
||||
{% endfor %}
|
||||
<hr />
|
||||
</body>
|
||||
|
Loading…
Reference in New Issue
Block a user