2011-07-11 02:10:22 +01:00
import os
2020-05-28 02:20:50 +01:00
import re
2023-01-27 23:21:07 +00:00
import sys
import time
2023-01-19 18:33:04 +00:00
from datetime import date , datetime
2021-11-11 20:57:49 +00:00
from pathlib import Path
2023-01-19 18:33:04 +00:00
from random import randint
2011-07-11 02:10:22 +01:00
2020-05-28 02:20:50 +01:00
from django . conf import settings
from django . template . defaultfilters import slugify
2023-01-27 23:21:07 +00:00
from parsers . people import GetPersonExpeditionNameLookup , load_people_expos
2023-01-29 16:47:46 +00:00
from troggle . core . models . caves import GetCaveLookup
2023-01-30 16:18:19 +00:00
from troggle . core . models . logbooks import LogbookEntry , PersonLogEntry
2021-04-13 00:43:57 +01:00
from troggle . core . models . troggle import DataIssue , Expedition
2023-01-28 13:14:54 +00:00
from troggle . core . utils import get_process_memory
2011-07-11 02:10:22 +01:00
"""
Parses and imports logbooks in all their wonderful confusion

See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
# Free-text list of outstanding work on this module; kept as a string so it can be
# read at runtime.
todo = """
- Most of the time is during the database writing (6s out of 8s).

- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)

- profile the code to find bad repetitive things, of which there are many.

- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted

- replace explicit 1970 date with a constant EPOCH

- rewrite to use generators rather than storing everything intermediate in lists - to
  reduce memory impact [low priority]

- We should ensure logbook.html is utf-8 and stop this crap:
            file_in = open(logbookfile,'rb')
            txt = file_in.read().decode("latin1")

- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
  data for old logbooks? Not worth it..
"""
2022-11-21 16:41:52 +00:00
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200

# Maps expedition year -> (blog file name, parser function name).
BLOG_PARSER_SETTINGS = {  # no default, must be explicit
    # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}

# Used for every year that has no explicit entry in LOGBOOK_PARSER_SETTINGS.
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"

# All years now (Jan.2023) use the default value for Logbook parser
# dont forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format.
# Maps expedition year -> (logbook file name, parser function name).
LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),
}
2023-01-27 23:21:07 +00:00
ENTRIES = {
2023-01-26 21:52:56 +00:00
" 2022 " : 90 ,
2023-01-19 21:18:42 +00:00
" 2019 " : 55 ,
" 2018 " : 95 ,
" 2017 " : 74 ,
" 2016 " : 86 ,
" 2015 " : 80 ,
2023-01-29 16:23:58 +00:00
" 2014 " : 67 ,
2023-01-19 21:18:42 +00:00
" 2013 " : 52 ,
2023-01-26 21:52:56 +00:00
" 2012 " : 76 ,
2023-01-19 21:18:42 +00:00
" 2011 " : 71 ,
" 2010 " : 22 ,
" 2009 " : 53 ,
" 2008 " : 49 ,
" 2007 " : 113 ,
" 2006 " : 60 ,
" 2005 " : 55 ,
" 2004 " : 76 ,
" 2003 " : 42 ,
" 2002 " : 31 ,
" 2001 " : 49 ,
" 2000 " : 54 ,
" 1999 " : 79 ,
" 1998 " : 43 ,
" 1997 " : 53 ,
" 1996 " : 95 ,
" 1995 " : 42 ,
" 1994 " : 32 ,
" 1993 " : 41 ,
" 1992 " : 62 ,
" 1991 " : 39 ,
" 1990 " : 87 ,
" 1989 " : 63 ,
" 1988 " : 61 ,
" 1987 " : 34 ,
" 1985 " : 24 ,
" 1984 " : 32 ,
" 1983 " : 52 ,
" 1982 " : 42 ,
}
logentries = [ ] # the entire logbook for one year is a single object: a list of entries
2023-01-27 23:21:07 +00:00
noncaveplaces = [ " travel " , " Journey " , " Loser Plateau " , " UNKNOWN " , " plateau " , " base camp " , " basecamp " , " top camp " , " topcamp " ]
2011-07-11 02:10:22 +01:00
2021-04-23 16:11:50 +01:00
def set_trip_id(year, seq):
    """Return the identifier for entry number `seq` of `year`, e.g. "2019_s03".

    `seq` must be an integer: it is zero-padded to at least two digits.
    """
    return f"{year}_s{seq:02d}"
2023-01-19 21:18:42 +00:00
# A whole name wrapped in <u>...</u>: this marks the author of the entry.
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
# A round- or square-bracketed aside inside a name, which is stripped before lookup.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")


def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Resolve the people named in a logbook entry for one expedition.

    `trippeople` is the raw text of the "trippeople" element: names separated by
    ",", "+", "&amp;", a bare "&" or " and ". A name wrapped in <u>..</u> is the author.
    A name starting with "*" is ignored altogether.

    Returns (res, author) where `res` is a list of (personexpedition, logtime_underground)
    tuples and `author` is the underlined person or, failing that, the last person in
    `res` who was actually matched. When nobody at all is listed the historical
    return value ("", 0) is kept so that existing callers behave as before.

    Names with no match are reported as a DataIssue; they still get a (None, time)
    entry in `res`, as before.
    """
    res = []
    author = None
    lookup = None  # name -> personexpedition, built lazily and only once per call
    # "&amp;" has to be tried before the bare "&" alternative, whose lookahead exists
    # to avoid splitting in the middle of other HTML entities such as "&eacute;".
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        author_u = rx_tripperson.match(tripperson)
        if author_u:
            tripperson = author_u.group(1).strip()
        if not tripperson:
            continue
        if tripperson[0] == "*":
            # a person but with * prefix. Ignored everywhere.
            continue

        tripperson = rx_round_bracket.sub("", tripperson).strip()
        # Whacky aliases all handled in GetPersonExpeditionNameLookup()
        if lookup is None:
            lookup = GetPersonExpeditionNameLookup(expedition)
        personyear = lookup.get(tripperson.lower())
        if not personyear:
            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
            print(message)
            DataIssue.objects.create(parser="logbooks", message=message)
        res.append((personyear, logtime_underground))
        if author_u:
            author = personyear

    if not author:
        if not res:
            return "", 0
        # Fall back to the last person who was really matched; the last element of
        # `res` may be an unmatched (None) name.
        author = next((person for person, _ in reversed(res) if person), None)
    return res, author
2023-01-28 10:47:25 +00:00
def tidy_time_underground(logtime_underground):
    """Convert the recorded "time underground" into a number of hours.

    Accepts a number, a numeric string, or free text that starts with an optional
    "T/U:" label followed by a number (e.g. "T/U: 3.5 hrs"). Anything empty or
    unparseable gives 0.
    """
    if not logtime_underground:
        return 0
    try:
        return float(logtime_underground)
    except (TypeError, ValueError):
        pass  # not a plain number: look for a leading number in the text instead
    # str() so that an unexpected non-string value yields 0 rather than a TypeError
    tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", str(logtime_underground))
    if tu_match:
        return float(tu_match.group(2))
    return 0
2022-12-18 19:33:56 +00:00
2023-02-02 15:40:50 +00:00
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
    """Resolve the people on a trip, recording a DataIssue when that goes wrong.

    Returns (trippersons, author) exactly as GetTripPersons() does. If GetTripPersons()
    raises, the problem is logged and ([], None) is returned, so callers that unpack
    two values keep working.
    """
    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        # Previously a bare `return` (None) here made every caller crash on unpacking.
        return [], None

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return trippersons, author
2023-01-28 11:45:30 +00:00
def tidy_trip_cave(place):
    """Return the cave that `place` refers to, or None when it is not a cave.

    GetCaveLookup() need to work better. None of this data is *used* though?
    'tripcave' is converted to a string doing this, which renders as the cave slug.
    """
    lplace = place.strip().lower()
    # noncaveplaces contains mixed-case names ("UNKNOWN", "Journey", ...), so it has to
    # be lowercased as well or those names can never match the lowercased place.
    if lplace in {name.lower() for name in noncaveplaces}:
        return None
    return GetCaveLookup().get(lplace)
def tidy_trip_image_urls(text, date):
    """Point relative image links in the entry text at the expedition year's folder.

    Every `src="..."` / `src='...'` value gets the prefix "/years/<yyyy>/", where <yyyy>
    is the first four characters of str(date). Values that already carry that prefix
    are left with a single copy of it. Absolute URLs (http:, https:, protocol-relative
    "//" and data:) are left alone, since prefixing them would break the link.
    Tabs are removed and triple newlines are reduced to double.
    """
    y = str(date)[:4]
    prefix = f"/years/{y}/"

    # A function replacement is used so that nothing in `prefix` is read as a regex escape.
    text = re.sub(
        r"""src=(["'])(?!https?:|//|data:)""",
        lambda m: f"src={m.group(1)}{prefix}",
        text,
    )
    # A link that already had the prefix now has it twice: collapse to one.
    for quote in ('"', "'"):
        text = text.replace(f"src={quote}{prefix}{prefix}", f"src={quote}{prefix}")

    text = text.replace("\t", "")
    text = text.replace("\n\n\n", "\n\n")
    return text
2023-01-28 13:14:54 +00:00
def tidy_tid(tid, title):
    """Return `tid` unchanged, or invent an identifier when `tid` is None.

    The invented identifier is a random four-digit number, an underscore, and the first
    ten characters of the slugified title with "-" replaced by "_".
    """
    if tid is not None:
        return tid
    return str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
2023-01-28 11:45:30 +00:00
2023-01-28 13:14:54 +00:00
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
    """Save a single logbook entry and its related PersonLogEntry items.

    One LogbookEntry is created, then one PersonLogEntry per (person, time) pair in
    `trippersons`, written with a single bulk_create. The person equal to `author` is
    flagged as the entry's author.

    We could do a bulk update to save all the entries, but then we would need to do a query on
    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
    faster ?
    """
    lbo = LogbookEntry.objects.create(
        place=place,
        text=text,
        expedition=expedition,
        time_underground=logtime_underground,
        cave_slug=str(tripcave),  # note: a tripcave of None is stored as the string "None"
        slug=tid,
        date=date,
        title=title,
    )

    pt_list = [
        PersonLogEntry(
            personexpedition=tripperson,
            logbook_entry=lbo,
            time_underground=time_underground,
            is_logbook_entry_author=(tripperson == author),
        )
        for tripperson, time_underground in trippersons
    ]
    PersonLogEntry.objects.bulk_create(pt_list)
2023-01-27 23:21:07 +00:00
def parser_date(tripdate, year):
    """Interpret a date from an expo logbook and return a datetime.date object.

    `tripdate` is either ISO style "yyyy-mm-dd" or day-first "d/m/yy" / "dd/mm/yyyy".
    `year` is the expedition year as a four-character string; a date whose year or
    century disagrees with it is rejected.

    Every rejected or unparseable date is recorded as a DataIssue and 1970-01-01 is
    returned in its place.
    """
    dummydate = date(1970, 1, 1)  # replace with _EPOCH
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # one or two digits for the month, so that October-December are accepted too
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)

        if mdatestandard:
            if mdatestandard.group(1) != year:
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            return date(int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)))

        if mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + century
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            yadd = int(year[:2]) * 100
            return date(int(mdategoof.group(4)) + yadd, int(mdategoof.group(2)), int(mdategoof.group(1)))

        # Report the expedition year (not 1970) so that the issue can be traced back to,
        # and cleaned up with, the logbook it came from.
        message = f" ! - Bad date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        return dummydate
    except (TypeError, ValueError):
        # e.g. day or month out of range, or tripdate/year not a string
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        # `datetime` here is the class imported from the datetime module, so the old
        # `datetime.date(1970, 1, 1)` raised a TypeError instead of returning a date.
        return dummydate
2023-01-19 21:18:42 +00:00
2022-12-16 19:57:56 +00:00
# One logbook entry with the people listed before the title (the normal order).
# Groups: anchor id, tripdate id, date, people, title, body text, time underground.
_RX_ENTRY_PEOPLE_THEN_TITLE = re.compile(
    r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
        \s*<div\s+class="trippeople">\s*(.*?)</div>
        \s*<div\s+class="triptitle">\s*(.*?)</div>
        ([\s\S]*?)
        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
        \s*$
    """
)
# The same entry with the title before the people.
# Groups: anchor id, tripdate id, date, title, people, body text, time underground.
_RX_ENTRY_TITLE_THEN_PEOPLE = re.compile(
    r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
        \s*<div\s+class="triptitle">\s*(.*?)</div>
        \s*<div\s+class="trippeople">\s*(.*?)</div>
        ([\s\S]*?)
        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
        \s*$
    """
)


def _stash_logbook_matter(pattern, txt, year, filename):
    """Write the text captured by group 1 of `pattern` to years/<year>/<filename>, if any."""
    found = re.match(pattern, txt)
    if not found:
        # The file has no <body>/<hr> skeleton: there is nothing to stash.
        return
    para = found.group(1).strip()
    if para:
        # Logbooks are read as UTF-8, so write the extracts as UTF-8 as well.
        with open(Path(settings.EXPOWEB, "years", year, filename), "w", encoding="utf-8") as out:
            out.write(para + "\n")


def parser_html(year, expedition, txt, seq=""):
    """Parse one logbook.html text into a list of entry tuples.

    Each tuple is (date, place, tripcave, title, content, trippersons, author,
    expedition, time_underground, tid). Entries that match neither layout are
    recorded as DataIssues and skipped. `seq` is accepted for compatibility with the
    other parsers and is not used.

    As a side effect, the material before the first <hr and after the last <hr /> is
    written to frontmatter.html and endmatter.html in the year's folder.

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    You can't see it here, but a round-trip export-then-import will move
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
    logentries = []
    dupl = {}  # (date, title) -> number of times seen so far

    # extract front and END material and stash for later use when rebuilding from list of entries
    _stash_logbook_matter(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt, year, "frontmatter.html")
    _stash_logbook_matter(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt, year, "endmatter.html")

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        s = _RX_ENTRY_PEOPLE_THEN_TITLE.match(trippara)
        if s:
            _, _, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        else:  # allow title and people to be swapped in order
            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)

            s2 = _RX_ENTRY_TITLE_THEN_PEOPLE.match(trippara)
            if not s2:
                msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
                print(msg)
                DataIssue.objects.create(parser="logbooks", message=msg)
                continue
            _, _, tripdate, triptitle, trippeople, triptext, tu = s2.groups()

        ldate = parser_date(tripdate.strip(), year)

        # The title is expected to look like "<place> - <what happened>"
        triptitles = triptitle.split(" - ")
        place = triptitles[0] if len(triptitles) >= 2 else "UNKNOWN"

        tripcontent = re.sub(r"</p>", "", triptext)
        tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()

        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. We fix this here.
        check = (ldate, triptitle)
        if check in dupl:
            dupl[check] += 1
            triptitle = f"{triptitle} #{dupl[check]}"
            print(f" - {triptitle} -- {ldate}")
        else:
            dupl[check] = 1

        tu = tidy_time_underground(tu)
        trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, ldate)
        tid = tidy_tid(tid, triptitle)

        logentries.append((ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid))
    return logentries
2020-06-08 21:33:32 +01:00
2023-01-19 21:18:42 +00:00
2022-12-16 19:57:56 +00:00
def parser_blog(year, expedition, txt, sq=""):
    """Parse a web page saved ("Save As HTML") from the UK Caving blog website.

    Returns a list of entry tuples in the same layout as parser_html():
    (date, place, tripcave, title, content, trippersons, author, expedition,
    time_underground, tid). `sq` is the suffix of the blog file being read ("", "2",
    ...) and is worked into every title and tid so that they stay distinct.

    Note that the entries have dates and authors, but no titles.
    See detailed explanation of the complete process:
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
    https://expo.survex.com/handbook/computing/log-blog-parsing.html

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    BLOG entries have this structure:
    <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
        <article class="message-body js-selectToQuote">
        </article>
    </article>
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.
    """
    logentries = []

    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
    if not tripheads:
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt
    )
    if not tripparas:
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    if len(tripheads) != len(tripparas):
        print(f"{len(tripheads)} != {len(tripparas)}")
    print(f"{len(tripheads)} - {len(tripparas)}")

    tu = 0  # no logged time underground in a blog entry
    # zip() stops at the shorter list, so a header/content count mismatch can no longer
    # raise an IndexError.
    for logbook_entry_count, (triphead, tripstuff) in enumerate(zip(tripheads, tripparas), start=1):
        attach = tripstuff[2]
        # note use on non-greedy *? regex idiom here
        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)", "", attach)
        attach = re.sub(r"<footer[\s\S]*(</footer>)", "", attach)
        tripcontent = tripstuff[0] + attach

        tid = set_trip_id(year, logbook_entry_count) + "_blog" + sq

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not match_author:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break  # NOTE(review): this abandons all remaining posts, not only this one
        trippeople = match_author.group(1)

        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not match_datetime:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break  # NOTE(review): as above
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except ValueError:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])

        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function.
        # We can't know what this is, so we set it as 'Expo' and 'Unknown'.
        place = "Unknown"
        # triptitle must be unique for a given date. We can enforce this here.
        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"

        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
        tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

        # The title argument was missing and `logtime_underground` was an undefined name.
        trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
        # Pass the entry's own date; `date` is the class imported from datetime.
        tripcontent = tidy_trip_image_urls(tripcontent, tripdate)
        tid = tidy_tid(tid, triptitle)

        logentries.append((tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid))
    return logentries
2019-03-06 23:20:34 +00:00
2023-01-27 17:41:10 +00:00
def clean_all_logbooks():
    """Delete every DataIssue raised by the "logbooks" parser and every LogbookEntry."""
    DataIssue.objects.filter(parser="logbooks").delete()
    LogbookEntry.objects.all().delete()
2023-01-27 17:24:31 +00:00
def clean_logbook_for_expedition(expedition):
    """Only used when loading a single logbook. Deletes database LogBookEntries and
    DataIssues for this expedition year.

    A "logbooks" DataIssue is taken to belong to the expedition when its message
    contains the expedition's year.
    """
    LogbookEntry.objects.filter(expedition=expedition).delete()
    # One filtered delete in the database instead of fetching every logbook issue and
    # regex-searching each message in Python.
    DataIssue.objects.filter(parser="logbooks", message__contains=expedition.year).delete()
def parse_logbook_for_expedition(expedition, blog=False):
    """Parse all logbook entries for one expedition and return them as a list.

    The file name and parser come from LOGBOOK_PARSER_SETTINGS, or from the defaults
    when the year is not listed. With blog=True they come from BLOG_PARSER_SETTINGS;
    if the year has no blog configured the problem is recorded and an empty list is
    returned. A logbook may be split over several files ("name.html", "name2.html",
    ... "name4.html"): they are read in order until one is missing, and their entries
    are concatenated.

    Sets expedition.logbookfile but does not save the expedition.
    """
    logentries = []
    expologbase = Path(settings.EXPOWEB, "years")
    year = expedition.year
    # .get(): a year missing from ENTRIES must not stop the logbook from being parsed
    expect = ENTRIES.get(year)

    if year in LOGBOOK_PARSER_SETTINGS:
        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
    else:
        yearfile, parsefunc = DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER
    expedition.logbookfile = yearfile  # don't change this if a blog

    if blog:
        print(f" - BLOG file {yearfile} using parser {parsefunc}")
        if year not in BLOG_PARSER_SETTINGS:
            message = f" ! - Expecting blog parser buut none specified for {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # Carrying on would parse the ordinary logbook a second time.
            return logentries
        yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]

    logbookpath = Path(yearfile)
    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not lb.is_file():
            # End of blog. Next blog file in sequence not there
            break
        try:
            # read as bytes and decode explicitly, so that line endings are left untouched
            txt = lb.read_bytes().decode("utf-8")
        except (OSError, UnicodeDecodeError):
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
            continue
        except Exception:
            print(f" ! Very Bad Error opening {lb}")
            continue

        parser = globals()[parsefunc]
        print(f" - {year} parsing with {parsefunc} - {lb}")
        # accumulate: each file in the sequence adds its entries to the ones before
        logentries += parser(year, expedition, txt, sq)  # this launches the right parser

    if expect is not None and len(logentries) != expect:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")

    return logentries
2020-05-28 04:54:53 +01:00
2023-01-19 21:18:42 +00:00
2022-12-18 19:33:56 +00:00
def LoadLogbook(year):
    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'

    Deletes the stored entries and logbook DataIssues for the year, parses the logbook
    (and the blog, when one is configured for the year) and stores every entry.
    This is inside an atomic transaction
    """
    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)

    logentries = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    if year in BLOG_PARSER_SETTINGS:
        logentries += parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook
    else:
        print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )

    for entrytuple in logentries:
        # `entry_date` rather than `date`, which would shadow the class imported from datetime
        entry_date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
        if expo == expedition:  # unneeded check, we zeroed it before filling it
            store_entry_into_database(entry_date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
        else:
            print(f" ! unexpected log entry labelled as '{expedition}' {tid}")
    expo.save()  # to save logbook name property
2023-01-27 23:21:07 +00:00
2011-07-11 02:10:22 +01:00
def LoadLogbooks():
    """Master function: parse every expedition logbook (and blog) into the Troggle database.

    All logbooks are parsed into memory first, then the entries are written to
    the database one after another. This should be rewritten to use coroutines
    to load all logbooks from disc in parallel, but the database writes must stay
    serialised as sqlite is single-user.

    This is inside an atomic transaction. Maybe it shouldn't be..
    """
    global ENTRIES
    global logentries

    def record_issue(message):
        # Store the problem as a DataIssue for this parser and echo it to stdout.
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    def report_usage():
        # Memory growth and elapsed time since the start of this run, to stderr.
        mem = get_process_memory()
        print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
        duration = time.time() - start
        print(f" - TIME: {duration:7.2f} s", file=sys.stderr)

    allentries = []
    mem1 = get_process_memory()
    print(f" - MEM: {mem1:7.2f} MB now", file=sys.stderr)
    start = time.time()

    clean_all_logbooks()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        record_issue(" ! - No expeditions found. Attempting to 'people' first")
        load_people_expos()
        expos = Expedition.objects.all()
        if len(expos) <= 1:
            record_issue(" ! - No expeditions found, even after attempting to load 'people'. Abort.")
            return

    # Years for which no logbook parse is attempted.
    noexpo = ["1986", "2020", "2021"]  # no expo
    lostlogbook = [str(y) for y in range(1976, 1982)]  # "1976" .. "1981"
    sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.
    nologbook = noexpo + lostlogbook + sqlfail

    loglist = []
    bloglist = []
    for expo in expos:
        year = expo.year
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            record_issue(f" ! - Not even attempting to parse logbook for {year} until code fixed")

        if year not in nologbook:
            if year in ENTRIES:
                loglist.append(expo)
            else:
                # catch case when preparing for next expo
                print(" - No Logbook yet for: " + year)

        if year in BLOG_PARSER_SETTINGS:
            bloglist.append(expo)

    # Phase 1: parse everything into memory.
    for ex in loglist:
        logentries = parse_logbook_for_expedition(ex)  # the logbook for one expo
        allentries.extend(logentries)
    for b in bloglist:
        print(f" - BLOG: {b}")
        logentries = parse_logbook_for_expedition(b, blog=True)  # the blog for one expo
        allentries.extend(logentries)

    print(f"total {len(allentries):,} log entries parsed in all expeditions")
    report_usage()

    # Phase 2: serially store the parsed data in the database, updating 3 types of object:
    # - Expedition (the 'logbook.html' value)
    # - LogBookEntry (text, who when etc.)
    # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
    for entry in allentries:
        (tripdate, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) = entry
        store_entry_into_database(
            tripdate, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid
        )

    for expo in expos:
        expo.save()  # to save logbook name property

    report_usage()
2021-04-23 16:11:50 +01:00
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)