troggle-unchained/parsers/cavesM.py


import os
import re
import subprocess
import xml.etree.ElementTree as ET #this is used to parse XML's

from django.conf import settings

import troggle.core.models as models #import models for various objects

#
#    This parser has to find several things:
#    There are .html files in the expoweb area - they contain some of the important information
#    There is a similar number of .svx files in the loser area - they contain all the measurements
#
#    The previous version was incredibly slow due to poor ideas about finding things
#    and over-reliance on Python when handling regular expressions; the new version delegates
#    the heavy lifting to the shell and handles only the more sophisticated bits itself
#

def load():
    """Entry point of the caves parser.

    Prints two progress banners and then imports the 'caves-1623/' area
    through loadarea().
    """
    area = 'caves-1623/'
    print("Hi! I'm caves parser. Ready to work")
    print('Loading caves of 1623 area')
    loadarea(area)


def _warn(content, console):
    """Print *console* and store *content* as a 'warn' message of the caves parser."""
    msg = models.Parser_messageM(parsername='caves', content=content, message_type='warn')
    print(console)
    msg.save()


def loadarea(areacode):
    """Load every cave of one survey area into the database.

    Works in two passes:

    1. Each direct subdirectory ``<name>`` of ``settings.SURVEX_DATA + areacode``
       is expected to contain a master file ``<name>/<name>.svx``.  That file is
       run through ``cavern`` and a ``CaveM`` row is saved with the survex
       path, total length, total depth and title.
    2. Every ``*.html`` file below ``settings.CAVEDESCRIPTIONS`` is scanned for
       its ``caveslug`` and ``underground_description`` elements.  The
       description is attached to the single ``CaveM`` whose survex file name
       matches the slug; if none matches a new ``CaveM`` is created, and if
       several match the file is skipped.

    Anything that cannot be processed is recorded as a ``Parser_messageM``
    warning and skipped, so one broken cave does not abort the whole import.

    areacode -- area directory relative to settings.SURVEX_DATA,
                e.g. 'caves-1623/'.
    """
    print('Searching all cave dirs files')
    basedir = settings.SURVEX_DATA + areacode

    # Process the combined master file first; presumably this produces the
    # 1623-and-1626.3d file that dump3d reads further down.
    bash('cavern -o' + settings.SURVEX_DATA + ' ' + settings.SURVEX_DATA + '1623-and-1626.svx')

    # find lists basedir itself followed by its direct subdirectories
    cavedirs = bash("find " + basedir + " -maxdepth 1 -type d").splitlines()
    print('Obtained list of directories! (#dirs=' + str(len(cavedirs)) + ')')

    for cavedir in cavedirs:
        if cavedir == basedir:
            continue  # skip the basedir - a non-proper subdirectory

        cavename = os.path.basename(cavedir)  # final bit of the directory
        fullname = cavedir + '/' + cavename + '.svx'

        if not os.path.isfile(fullname):
            _warn(cavedir + '/' + cavename + ' MISSING!', 'Cave missing' + cavename + ' :(')
            continue

        print('Found cave:' + cavename)
        cavernout = bash('cavern -q ' + fullname)  # make cavern process the thing
        if 'cavern: error:' in cavernout:
            _warn(cavedir + '/' + cavename + ' Survex file messed up!', 'Fucked svx' + cavename + ' :(')
            continue

        cavernout = cavernout.splitlines()
        try:
            # the numbers are picked out of cavern's summary lines by position
            # and have their unit suffix sliced off
            depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])
            length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])
        except (IndexError, ValueError):
            _warn(cavedir + '/' + cavename + ' Could not read length/depth from cavern output!',
                  'Unreadable cavern output' + cavename + ' :(')
            continue

        # second space-separated field of the first *begin line
        beginlines = bash("cat " + fullname + " | grep '\\*begin' | head -n1 | cut -f2 -d ' ' ").splitlines()
        if not beginlines:
            _warn(cavedir + '/' + cavename + ' No *begin found!', 'No *begin in' + cavename + ' :(')
            continue
        surveyname = beginlines.pop()

        # NOTE: cut keeps only the second space-separated field, so only the
        # first word of a multi-word *title ends up in the name.
        title = (bash("cat " + fullname + " | grep '\\*title' | head -n1 | cut -f2 -d ' ' ").splitlines() or ["Not found"])[0]
        print((('depth', 'length', 'surv name'), (depth, length, surveyname)))

        # Build the command once so the debug output shows exactly what is run.
        dumpcmd = ("dump3d " + settings.SURVEX_DATA + "1623-and-1626.3d | grep NODE | grep '\\[.*\\."
                   + surveyname + ".*\\]'")
        print(dumpcmd)
        nodes = bash(dumpcmd).splitlines()
        print(nodes)

        newcave = models.CaveM(survex_file=fullname, total_length=length, name=title, total_depth=depth)
        newcave.save()
    #end of reading survex masterfiles

    print ("Reading cave descriptions")
    cavefiles = bash("find " + settings.CAVEDESCRIPTIONS + " -name '*.html'").splitlines()
    for fn in cavefiles:
        print(fn)
        with open(fn, "r") as f:  # closed again as soon as the text is read
            contents = f.read()

        desc = extractXML(contents, 'underground_description')

        # The slug is the text between '>' and '<' on the caveslug line with its
        # first five characters dropped (presumably an area prefix like '1623-').
        slugline = extractXML(contents, 'caveslug')
        slugmatch = re.search(r'>.*<', slugline) if slugline else None
        name = slugmatch.group()[6:-1] if slugmatch else None

        if desc is None or name is None:
            _warn(fn + ' Description meesed up!', 'Fucked description ' + fn + ' :(')
            continue

        matches = models.CaveM.objects.filter(survex_file__icontains='/' + name + '.svx')
        nmatches = len(matches)
        if nmatches > 1:
            print('Non unique solution - skipping. Name:' + name)
        elif nmatches == 0:
            print('Cave with no survex data' + name)
            newcave = models.CaveM(description=desc, name=name)
            newcave.save()
        else:  # exactly one match
            updatecave = matches[0]
            updatecave.description = desc
            if updatecave.name == "Not found":
                # the survex file had no *title, fall back to the slug
                updatecave.name = name
            updatecave.title = name
            updatecave.save()
    #end of reading cave descriptions
    
        
def extractXML(contents,tag):
    """Return the lines of *contents* that hold the element *tag*.

    The search is line based: the result runs from the first line containing
    ``<tag>`` up to and including the first line, at or after it, containing
    ``</tag>``.  The lines are returned whole (tags and any surrounding text
    included), joined with newlines.

    contents -- text to search
    tag      -- element name without angle brackets

    Returns the matching text, or None if the opening tag is missing or no
    closing tag follows it.
    """
    opening = '<'+tag+'>'
    closing = '</'+tag+'>'
    lines = contents.splitlines()

    begi = next((i for i, line in enumerate(lines) if opening in line), None)
    if begi is None:
        return None

    # Only look for the closing tag from the opening line onwards, so a stray
    # closing tag earlier in the file cannot produce an empty slice.
    endi = next((i for i in range(begi, len(lines)) if closing in lines[i]), None)
    if endi is None:
        return None

    # Joining also covers the single-line case (a one-element join is the line
    # itself); multi-line elements are returned in full rather than as the
    # first character of the joined text.
    return '\n'.join(lines[begi:endi+1])
    

def bash(cmd):
    """Run *cmd* through the system shell and return what it wrote to stdout.

    cmd -- complete command line; it is interpreted by the shell (so pipes,
           quoting and redirections work) and must therefore only be built
           from trusted text.

    Returns the captured standard output as text.  Standard error is not
    captured and the exit status is ignored.
    """
    # universal_newlines=True makes the pipe deliver text instead of bytes, so
    # callers can use ordinary string operations on the result.
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    output, _ = process.communicate()
    return output
working on rebuilding everything 2019-02-27 22:29:45 +00:00
			`import troggle.core.models as models #import models for various objects`
			`from django.conf import settings`
			`import xml.etree.ElementTree as ET #this is used to parse XML's`
			`import subprocess`
			`import re`

			`#`
			`# This parser has to find several things:`
			`# There are files of .html format in expoweb area - they contain some of the important information`
			`# There is a similar number of .svx files in loser are - they contain all the measurements`
			`#`
			`# Previous version was incredibly slow due to various shitty ideas about finding things`
			`# and overelayance on python when handling regular expressions, new version delegates heavy lifting to shell`
			`# and handles more sophisticated bits only`
			`#`

			`def load():`
			`print('Hi! I\'m caves parser. Ready to work')`

			`print('Loading caves of 1623 area')`
			`loadarea('caves-1623/')`


			`def loadarea(areacode):`


			`print('Searching all cave dirs files')`
			`basedir = settings.SURVEX_DATA+areacode`

			`bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')`

			`cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories`
			`print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')`
			`ndirs = len(cavedirs) #remember number of dirs for nice debug output`

			`for cavedir in cavedirs:`
			`if cavedir==basedir:`
			`continue #skip the basedir - a non-proper subdirectory`
			`cavename = bash('echo '+cavedir+' \| rev \| cut -f1 -d \'/\' \| rev').splitlines()[0] #get final bit of the directory`

			`test = bash('if [ ! -f '+cavedir+'/'+cavename+'.svx ] ; then echo MISSING; fi')#test for file exisence`
			`if 'MISSING' in test: #send error message to the database`
			`msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' MISSING!',message_type='warn')`
			`print('Cave missing'+cavename+' :(')`
			`msg.save()`
			`continue`
			`fullname=cavedir+'/'+cavename+'.svx'`
			`print('Found cave:'+cavename)`
			`cavernout = bash('cavern -q '+fullname) #make cavern process the thing`
			`if 'cavern: error:' in cavernout:`
			`msg = models.Parser_messageM(parsername='caves',content=cavedir+'/'+cavename+' Survex file messed up!',message_type='warn')`
			`print('Fucked svx'+cavename+' :(')`
			`msg.save()`
			`continue`

			`cavernout = cavernout.splitlines()`
			`depth = float(([x for x in cavernout if ('Total vertical length' in x)][0].split()[-1])[:-2])`
			`length = float(([x for x in cavernout if ('Total length' in x)][0].split()[6])[:-1])`
			`surveyname = bash('cat '+fullname+' \| grep \'\*begin\' \| head -n1 \| cut -f2 -d \' \' ').splitlines().pop()`
			`title = (bash('cat '+fullname+' \| grep \'\*title\' \| head -n1 \| cut -f2 -d \' \' ').splitlines() or ["Not found"])[0]`
			`print((('depth','length','surv name'),(depth,length,surveyname)))`
			`print('dump3d '+settings.SURVEX_DATA+'1623-and-1626.3d \| grep NODE \| grep \'\\[\\.'+surveyname+'.*\\]\'')`
			`nodes = bash('dump3d '+settings.SURVEX_DATA+'1623-and-1626.3d \| grep NODE \| grep \'\\[.\\.'+surveyname+'.\\]\'').splitlines()`
			`entran = [x for x in nodes if ('ENTRANCE' in x) ]`
			`print(nodes)`


			`newcave = models.CaveM(survex_file = fullname, total_length = length, name=title, total_depth = depth)`
			`newcave.save()`
			`#end of reading survex masterfiles`

			`print ("Reading cave descriptions")`
			`cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()`
			`for fn in cavefiles:`
			`f = open(fn, "r")`
			`print(fn)`
			`contents = f.read()`

			`desc = extractXML(contents,'underground_description')`
			`name = re.search(r'>.*<',extractXML(contents,'caveslug')).group()[6:-1]`

			`if desc==None or name==None:`
			`msg = models.Parser_messageM(parsername='caves',content=fn+' Description meesed up!',message_type='warn')`
			`print('Fucked description '+fn+' :(')`
			`msg.save()`
			`continue`


			`updatecave = models.CaveM.objects.filter(survex_file__icontains='/'+name+'.svx')`
			`if len(updatecave)>1:`
			`print('Non unique solution - skipping. Name:'+name)`
			`elif len(updatecave)==0:`
			`print('Cave with no survex data'+name)`
			`newcave = models.CaveM(description = desc, name = name)`
			`newcave.save()`
			`else: #exaclty one match`
			`updatecave = updatecave[0]`
			`updatecave.description = desc`
			`if updatecave.name=="Not found":`
			`updatecave.name=name`
			`updatecave.title=name`
			`updatecave.save()`


			`#end of reading cave descriptions`



			`def extractXML(contents,tag):`
			`#find correct lines`
			`lines = contents.splitlines()`
			`beg = [x for x in lines if ('<'+tag+'>' in x)]`
			`end = [x for x in lines if ('</'+tag+'>' in x)]`
			`if (not beg) or (not end):`
			`return None`
			`begi = lines.index(beg[0])`
			`endi = lines.index(end[0])`
			`if endi!=begi:`
			`segment = '\n'.join(lines[begi:endi+1])`
			`else:`
			`segment = lines[begi:endi+1]`
			`return segment[0]`


			`def bash(cmd): #calls command in bash shell, returns output`
			`process = subprocess.Popen(cmd,shell=True,stdout=subprocess.PIPE)`
			`output, error = process.communicate()`
			`return output`