troggle-unchained/parsers/cavesM.py


import os
import re
import subprocess
import xml.etree.ElementTree as ET #this is used to parse XML's

from django.conf import settings

import troggle.core.models as models #import models for various objects

#
#    This parser has to find several things:
#    There are files of .html format in the expoweb area - they contain some of the important information
#    There is a similar number of .svx files in the loser area - they contain all the measurements
#
#    The previous version was incredibly slow due to various poor ideas about finding things
#    and over-reliance on python when handling regular expressions; the new version delegates heavy lifting to the shell
#    and handles the more sophisticated bits only
#

def load():
    """Entry point of the caves parser: announce itself, then load each area.

    The areas are processed in a fixed order, 1623 first and 1626 second, and
    a progress line is printed before each one is handed to loadarea().
    """
    print('Hi! I\'m caves parser. Ready to work')

    # (progress message, area code) pairs, in processing order
    steps = (
        ('Loading caves of 1623 area', '1623'),
        ('Loading caves of 1626 area', '1626'),
    )
    for message, areacode in steps:
        print(message)
        loadarea(areacode)


def loadarea(areacode):
    """Load the caves of one area into the database.

    First creates a CaveM row for every usable cave directory found under
    settings.SURVEX_DATA + 'caves-<areacode>/', then walks the cave
    description files and links each one to the CaveM row it matches.

    areacode -- area number as a string; load() passes '1623' and '1626'.

    Missing survex files, survex files cavern reports an error for, and
    descriptions lacking a slug or an underground description are stored as
    Parser_messageM warnings and skipped.
    """
    _load_survex_caves(areacode)
    #end of reading survex masterfiles

    # NOTE(review): the description pass is not restricted to areacode, so it
    # walks every description file on each call - confirm that repeating it
    # once per area is intended.
    _load_cave_descriptions()
    #end of reading cave descriptions


def _warn(content):
    """Store content as a 'warn' message of the 'caves' parser."""
    msg = models.Parser_messageM(parsername='caves', content=content, message_type='warn')
    msg.save()


def _lines_for_survey(lines, areacode, surveyname):
    """Return the dump3d lines labelled with the survey itself or anything below it."""
    label = '[' + areacode + '.' + surveyname
    return [x for x in lines if (label + '.' in x) or (label + ']' in x)]


def _load_survex_caves(areacode):
    """Create a CaveM row for each cave directory of areacode."""
    master3dfile = settings.SURVEX_DATA+'1623-and-1626.3d'
    if not file_exists(master3dfile):
        print('Computing master .3d file')
        bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')
    else:
        print('Loading from existing master .3d file')

    master3d = bash('dump3d -d '+master3dfile).splitlines()
    master3dN = [x for x in master3d if 'NODE' in x] #list of nodes of master survex file
    master3dL = [x for x in master3d if 'LINE' in x] #list of legs of master survex file

    print('Searching all cave dirs files')
    basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'

    cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories
    print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')

    for cavedir in cavedirs:
        if cavedir == basedir:
            continue #skip the basedir - a non-proper subdirectory
        cavename = os.path.basename(cavedir) #get final bit of the directory
        fullname = cavedir+'/'+cavename+'.svx'

        if not file_exists(fullname):
            print('Cave missing'+cavename+' :(')
            _warn(cavedir+'/'+cavename+' MISSING!')
            continue
        print('Found cave:'+cavename)

        cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing
        if 'cavern: error:' in cavernout:
            print('Fucked svx'+cavename+' :(')
            _warn(cavedir+'/'+cavename+' Survex file messed up!')
            continue

        cavernout = cavernout.splitlines()
        # NOTE(review): the depth parse strips two trailing characters while
        # the length parse strips one - confirm against real cavern output.
        depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
        length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])

        with open(fullname, 'r') as cavefile:
            cavefilecontents = cavefile.read().splitlines()
        # the survey name is the word following the first '*begin '
        surveyname = [x for x in cavefilecontents if '*begin ' in x][0].split()[1].lower()
        # NOTE(review): the survex '*title' used to be parsed here but was
        # never stored; the title is set from the description slug later on.

        relevant_nodes = _lines_for_survey(master3dN, areacode, surveyname)
        entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x]
        surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x]
        print('rel_nodes'+str(len(relevant_nodes)))
        # prefer entrance nodes, then surface nodes, then any node of the survey
        location_nodes = entrance_nodes or surface_nodes or relevant_nodes

        try:
            # candidate whose fourth field is numerically largest
            location = sorted(location_nodes, key=lambda y: float(y.split()[3])).pop()
        except (IndexError, ValueError):
            print(location_nodes)
            # store the marker itself; slicing it like a NODE line gave 'found'
            entrance = 'Not found'
        else:
            entrance = ' '.join(location.split()[1:3])

        # Date of the survey: final field of the leg line whose final field
        # sorts last. A final field that is a STYLE marker is not used and the
        # placeholder date is kept instead, as it is when there are no legs.
        lastdate = '1900.01.01'
        relevant_lines = _lines_for_survey(master3dL, areacode, surveyname)
        if relevant_lines:
            lastleg = sorted(relevant_lines, key=lambda y: y.split()[-1])[-1]
            candidate = lastleg.split()[-1]
            if 'STYLE' not in candidate:
                lastdate = candidate

        print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print

        newcave = models.CaveM(
            survex_file = fullname,
            total_length = length,
            name=areacode+'.'+surveyname,
            total_depth = depth,
            date = lastdate,
            entrance = entrance)
        newcave.save()


def _load_cave_descriptions():
    """Attach each .html cave description to the single CaveM row it matches."""
    print ("Reading cave descriptions")
    cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()
    for fn in cavefiles:
        with open(fn, "r") as f:
            print(fn)
            contents = f.read()

        rawslug = extractXML(contents,'caveslug')
        desc = extractXML(contents,'underground_description')
        # check before touching the slug: re.sub() on a missing slug would raise
        if rawslug is None or desc is None:
            print('Fucked description '+fn+' :(')
            _warn(fn+' Description meesed up!')
            continue

        slug = re.sub(r"\s+", "", rawslug)
        name = slug[5:] #get survex compatible name
        area = slug[0:4]
        print([area,name])
        print(area+'/'+name+'/'+name+'.svx')

        matches = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx')
        nmatches = len(matches)
        if nmatches > 1:
            print('Non unique solution - skipping. Name:'+name)
            continue
        if nmatches == 0:
            print('Cave with no survex data:'+name)
            continue

        #exactly one match
        print('Adding desc:'+name)
        updatecave = matches[0]
        updatecave.description = '/cave/descriptionM/'+slug #area-name
        updatecave.title = name
        updatecave.save()

        newcavedesc = models.Cave_descriptionM(
            slug = slug,
            explorers = extractXML(contents,'explorers'),
            underground_description = desc,
            equipment = extractXML(contents,'equipment'),
            references = extractXML(contents,'references'),
            survey = extractXML(contents,'survey'),
            kataster_status = extractXML(contents,'kataster_status'),
            underground_centre_line = extractXML(contents,'underground_centre_line'),
            survex_file = extractXML(contents,'survex_file'),
            notes = extractXML(contents,'notes'))
        newcavedesc.save()

    
def file_exists(filename):
    """Return True when filename names an existing regular file, else False.

    The check is done in-process, so names containing spaces or shell
    metacharacters are handled correctly. An empty name and a directory
    both give False.
    """
    return os.path.isfile(filename)

def extractXML(contents,tag):
    """Return the text enclosed by <tag>...</tag> in contents, or None.

    The search runs from the first line containing the opening tag to the
    first line containing the closing tag; lines in between are joined with
    '\\n'. Within that span the match is greedy, so it ends at the last
    closing tag of the span.

    None is returned when either tag is absent, and also when the closing
    tag only appears before the opening tag.
    """
    opening = '<'+tag+'>'
    closing = '</'+tag+'>'

    #find correct lines
    lines = contents.splitlines()
    begi = next((i for i, line in enumerate(lines) if opening in line), None)
    endi = next((i for i, line in enumerate(lines) if closing in line), None)
    if begi is None or endi is None:
        return None

    # empty when the closing line precedes the opening line
    segment = '\n'.join(lines[begi:endi+1])

    # escape the tags so a tag name is always matched literally
    match = re.search(re.escape(opening)+'(.*)'+re.escape(closing), segment, re.S)
    if match is None:
        return None
    return match.group(1)

def bash(cmd): #calls command in a shell, returns output
    """Run cmd through the shell and return what it wrote to stdout, as text.

    cmd is a single command string and is interpreted by the platform's
    default shell (shell=True), which is not necessarily bash.

    Text mode is requested so that the result is a str on Python 3 as well;
    callers test it with str operations such as "'NODE' in line". Only stdout
    is captured: stderr is not redirected, and the exit status is not checked.
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                               universal_newlines=True)
    output, _ = process.communicate() # second item is always None: stderr is not piped
    return output
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`import troggle.core.models as models #import models for various objects`
			`from django.conf import settings`
			`import xml.etree.ElementTree as ET #this is used to parse XML's`
			`import subprocess`
			`import re`

			`#`
			`# This parser has to find several things:`
			`# There are files of .html format in expoweb area - they contain some of the important information`
			`# There is a similar number of .svx files in loser are - they contain all the measurements`
			`#`
			`# Previous version was incredibly slow due to various shitty ideas about finding things`
			`# and overelayance on python when handling regular expressions, new version delegates heavy lifting to shell`
			`# and handles more sophisticated bits only`
			`#`

			`def load():`
			`print('Hi! I\'m caves parser. Ready to work')`

			`print('Loading caves of 1623 area')`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`loadarea('1623')`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
working maps: cave -> desc, survey -> cave, expedition -> person. Added /millnialpeople/ page. 2019-02-28 18:46:40 +00:00			`print('Loading caves of 1626 area')`
			`loadarea('1626')`

working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`def loadarea(areacode):`

rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`if not file_exists(settings.SURVEX_DATA+'1623-and-1626.3d'):`
			`print('Computing master .3d file')`
			`bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')`
			`else:`
			`print('Loading from existing master .3d file')`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`master3d = bash('dump3d -d '+settings.SURVEX_DATA+'1623-and-1626.3d').splitlines()`
			`master3dN = [x for x in master3d if ('NODE' in x)] #list of nodes of master survex file`
			`master3dL = [x for x in master3d if ('LINE' in x)] #list of nodes of master survex file`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`print('Searching all cave dirs files')`
			`basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories`
			`print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')`
			`ndirs = len(cavedirs) #remember number of dirs for nice debug output`

			`for cavedir in cavedirs:`
			`if cavedir==basedir:`
			`continue #skip the basedir - a non-proper subdirectory`
			`cavename = bash('echo '+cavedir+' \| rev \| cut -f1 -d \'/\' \| rev').splitlines()[0] #get final bit of the directory`

			`test = bash('if [ ! -f '+cavedir+'/'+cavename+'.svx ] ; then echo MISSING; fi')#test for file exisence`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`if not file_exists(cavedir+'/'+cavename+'.svx'):`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn')`
			`print('Cave missing'+cavename+' :(')`
			`msg.save()`
			`continue`
			`fullname=cavedir+'/'+cavename+'.svx'`
			`print('Found cave:'+cavename)`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`if 'cavern: error:' in cavernout:`
			`msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file messed up!',message_type='warn')`
			`print('Fucked svx'+cavename+' :(')`
			`msg.save()`
			`continue`

			`cavernout = cavernout.splitlines()`
			`depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])`
			`length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`cavefile = open(fullname,'r')`
			`cavefilecontents = cavefile.read().splitlines()`
			`surveyname = [x for x in cavefilecontents if ('*begin ') in x][0].split()[1].lower()`
			`try:`
			`title = [x for x in cavefilecontents if ('*title ') in x][0].split()[1]`
			`except:`
			`syrveyname = "Untitled"`

			`relevant_nodes = [x for x in master3dN if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]`
			`entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x]`
			`surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x]`
			`location_nodes = []`
			`print('rel_nodes'+str(len(relevant_nodes)))`
			`if len(entrance_nodes) > 0:`
			`location_nodes = entrance_nodes`
			`elif len(surface_nodes) > 0:`
			`location_nodes = surface_nodes`
			`elif len(relevant_nodes) > 0:`
			`location_nodes = relevant_nodes`

			`try:`
			`location = sorted(location_nodes, key = lambda y : float(y.split()[3])).pop()`
			`except:`
			`print(location_nodes)`
			`location = 'Not found'`

			`relevant_lines = [x for x in master3dL if (('['+areacode+'.'+surveyname+'.' in x) or ('['+areacode+'.'+surveyname+']' in x))]`
			`try:`
			`lastleg = sorted(relevant_lines, key = lambda y : y.split().pop()).pop()`
			`except:`
			`lastleg = ['LINE 1900.01.01']`
			`try:`
			`lastdate = lastleg.split().pop()`
			`if 'STYLE' in lastdate:`
			`lastdate = lastleg.split().pop().pop()`
			`except:`
			`lastdate = '1900.01.01'`

			`entrance = ' '.join(location.split()[1:3])`
			`print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print`


			`newcave = models.CaveM(`
			`survex_file = fullname,`
			`total_length = length,`
			`name=areacode+'.'+surveyname,`
			`total_depth = depth,`
			`date = lastdate,`
			`entrance = entrance)`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`newcave.save()`
			`#end of reading survex masterfiles`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`print ("Reading cave descriptions")`
			`cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()`
			`for fn in cavefiles:`
			`f = open(fn, "r")`
			`print(fn)`
			`contents = f.read()`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00
			`slug = re.sub(r"\s+", "", extractXML(contents,'caveslug'))`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`desc = extractXML(contents,'underground_description')`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`name = slug[5:] #get survex compatible name`
			`area = slug[0:4]`

			`print([area,name])`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`if desc==None or name==None:`
			`msg = models.Parser_messageM(parsername='caves',content=fn+' Description meesed up!',message_type='warn')`
			`print('Fucked description '+fn+' :(')`
			`msg.save()`
			`continue`

rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`print(area+'/'+name+'/'+name+'.svx')`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`updatecave = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx')`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`if len(updatecave)>1:`
			`print('Non unique solution - skipping. Name:'+name)`
			`elif len(updatecave)==0:`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`print('Cave with no survex data:'+name)`
			`continue`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`else: #exaclty one match`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`print('Adding desc:'+name)`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`updatecave = updatecave[0]`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`updatecave.description = '/cave/descriptionM/'+slug #area-name`
working on rebuilding everything 2019-02-27 22:29:45 +00:00			`updatecave.title=name`
			`updatecave.save()`

rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`slugS = slug`
			`explorersS = extractXML(contents,'explorers')`
			`underground_descriptionS = extractXML(contents,'underground_description')`
			`equipmentS = extractXML(contents,'equipment')`
			`referencesS = extractXML(contents,'references')`
			`surveyS = extractXML(contents,'survey')`
			`kataster_statusS = extractXML(contents,'kataster_status')`
			`underground_centre_lineS = extractXML(contents,'underground_centre_line')`
			`survex_fileS = extractXML(contents,'survex_file')`
			`notesS = extractXML(contents,'notes')`


			`newcavedesc = models.Cave_descriptionM(`
			`slug = slugS,`
			`explorers = explorersS,`
			`underground_description = underground_descriptionS,`
			`equipment = equipmentS,`
			`references = referencesS,`
			`survey = surveyS,`
			`kataster_status = kataster_statusS,`
			`underground_centre_line = underground_centre_lineS,`
			`survex_file = survex_fileS,`
			`notes = notesS)`
			`newcavedesc.save()`




working on rebuilding everything 2019-02-27 22:29:45 +00:00			`#end of reading cave descriptions`

rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`def file_exists(filename):`
			`test = bash('if [ ! -f '+filename+' ] ; then echo MISSING; fi')#test for file exisence`
			`if 'MISSING' in test: #send error message to the database`
			`return False`
			`return True`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`def extractXML(contents,tag):`
			`#find correct lines`
			`lines = contents.splitlines()`
			`beg = [x for x in lines if ('<'+tag+'>' in x)]`
			`end = [x for x in lines if ('</'+tag+'>' in x)]`
			`if (not beg) or (not end):`
			`return None`
			`begi = lines.index(beg[0])`
			`endi = lines.index(end[0])`
			`if endi!=begi:`
			`segment = '\n'.join(lines[begi:endi+1])`
			`else:`
rebuild descriptions database, some visuals 2019-02-28 12:36:49 +00:00			`segment = lines[begi:endi+1][0]`

			`hit = re.findall('<'+tag+'>(.*)</'+tag+'>', segment, re.S)[0]`
			`return hit`
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`def bash(cmd): #calls command in bash shell, returns output`
			`process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)`
			`output, error = process.communicate()`
			`return output`