# troggle-unchained/parsers/cavesM.py


import os
import re
import subprocess
import xml.etree.ElementTree as ET #this is used to parse XML's

from django.conf import settings

import troggle.core.models as models #import models for various objects

#
#    This parser has to find several things:
#    There are files in .html format in the expoweb area - they contain some of the important information
#    There is a similar number of .svx files in the loser area - they contain all the measurements
#
#    The previous version was incredibly slow due to various poor ideas about finding things
#    and an over-reliance on Python when handling regular expressions; the new version delegates
#    the heavy lifting to the shell and handles only the more sophisticated bits itself
#

def load():
    """Entry point of the caves parser: announce itself and import the 1623 area."""
    area_code = '1623'

    print("Hi! I'm caves parser. Ready to work")
    print('Loading caves of ' + area_code + ' area')

    loadarea(area_code)


def loadarea(areacode):
    """Import every cave of one survey area into the database.

    Two passes are made:

    1. each sub-directory of ``settings.SURVEX_DATA + 'caves-<areacode>/'`` is
       processed with ``cavern`` and stored as a ``models.CaveM`` row;
    2. every ``*.html`` file below ``settings.CAVEDESCRIPTIONS`` is parsed and,
       when it matches exactly one stored cave, linked to that cave and stored
       as a ``models.Cave_descriptionM`` row.

    A problem with an individual cave or description is recorded as a
    ``models.Parser_messageM`` warning and that item is skipped, so one broken
    file does not abort the whole import.

    :param areacode: area prefix used in directory names and survex labels,
        e.g. ``'1623'``.
    """
    _import_survex_caves(areacode)
    _import_cave_descriptions()


# Date stored for a cave when no dated survey leg can be found.
_DEFAULT_LEG_DATE = '1900.01.01'
# A token starting like "YYYY.MM.DD".
_LEG_DATE_RE = re.compile(r'^\d{4}\.\d{2}\.\d{2}')


def _save_warning(content):
    """Store a 'warn' message of the caves parser in the database."""
    models.Parser_messageM(parsername='caves', content=content, message_type='warn').save()


def _labelled(dump_lines, areacode, surveyname):
    """Return the dump3d lines labelled with the survey itself or anything below it."""
    prefix = '['+areacode+'.'+surveyname
    return [x for x in dump_lines if (prefix+'.' in x) or (prefix+']' in x)]


def _last_leg_date(leg_lines):
    """Return the date of the leg whose final token sorts last, or the default date."""
    if not leg_lines:
        return _DEFAULT_LEG_DATE
    tokens = sorted(leg_lines, key=lambda y: y.split()[-1])[-1].split()
    lastdate = tokens[-1]
    if 'STYLE' in lastdate:
        # The final token is a style marker; use the token before it, but only
        # when it really looks like a date (it may be the label instead).
        # NOTE(review): token order of dump3d LINE records should be confirmed.
        if len(tokens) > 1 and _LEG_DATE_RE.match(tokens[-2]):
            lastdate = tokens[-2]
        else:
            lastdate = _DEFAULT_LEG_DATE
    return lastdate


def _import_survex_caves(areacode):
    """Run cavern on every cave directory of the area and save one CaveM per cave."""
    master3d_path = settings.SURVEX_DATA+'1623-and-1626.3d'
    if not file_exists(master3d_path):
        print('Computing master .3d file')
        bash('cavern -o'+settings.SURVEX_DATA+' '+settings.SURVEX_DATA+'1623-and-1626.svx')
    else:
        print('Loading from existing master .3d file')

    master3d = bash('dump3d -d '+master3d_path).splitlines()
    master3dN = [x for x in master3d if 'NODE' in x] #list of nodes of master survex file
    master3dL = [x for x in master3d if 'LINE' in x] #list of legs of master survex file

    print('Searching all cave dirs files')
    basedir = settings.SURVEX_DATA+'caves-'+areacode+'/'

    cavedirs = bash("find "+basedir+" -maxdepth 1 -type d").splitlines() #this command finds all directories
    print('Obtained list of directories! (#dirs='+str(len(cavedirs))+')')

    for cavedir in cavedirs:
        if cavedir == basedir:
            continue #skip the basedir - a non-proper subdirectory
        cavename = cavedir.rsplit('/', 1)[-1] #final bit of the directory
        fullname = cavedir+'/'+cavename+'.svx'

        if not file_exists(fullname):
            print('Cave missing'+cavename+' :(')
            _save_warning(cavedir+'/'+cavename+' MISSING!')
            continue

        print('Found cave:'+cavename)
        cavernout = bash('cavern -o '+cavedir+' '+fullname) #make cavern process the thing
        if 'cavern: error:' in cavernout:
            print('Fucked svx'+cavename+' :(')
            _save_warning(cavedir+'/'+cavename+' Survex file messed up!')
            continue

        cavernout = cavernout.splitlines()
        try:
            depth_line = [x for x in cavernout if 'Total vertical length' in x][0]
            length_line = [x for x in cavernout if 'Total length' in x][0]
            # NOTE(review): the slices strip trailing unit characters from the
            # numbers; the exact cavern output format should be confirmed.
            depth = float(depth_line.split()[-1][:-2])
            length = float(length_line.split()[6][:-1])
        except (IndexError, ValueError):
            print('Unparseable cavern output '+cavename+' :(')
            _save_warning(cavedir+'/'+cavename+' Cavern output not understood!')
            continue

        with open(fullname, 'r') as cavefile:
            cavefilecontents = cavefile.read().splitlines()
        try:
            surveyname = [x for x in cavefilecontents if '*begin ' in x][0].split()[1].lower()
        except IndexError:
            print('No survey name '+cavename+' :(')
            _save_warning(cavedir+'/'+cavename+' No *begin in survex file!')
            continue

        relevant_nodes = _labelled(master3dN, areacode, surveyname)
        entrance_nodes = [x for x in relevant_nodes if 'ENTRANCE' in x]
        surface_nodes = [x for x in relevant_nodes if 'SURFACE' in x]
        print('rel_nodes'+str(len(relevant_nodes)))
        # prefer entrances, then surface stations, then any station of the cave
        location_nodes = entrance_nodes or surface_nodes or relevant_nodes

        try:
            # pick the node with the largest 4th field and keep its 2nd and 3rd
            # fields (presumably z, and x y, of "NODE x y z [label]" - confirm)
            location = sorted(location_nodes, key=lambda y: float(y.split()[3]))[-1]
            entrance = ' '.join(location.split()[1:3])
        except (IndexError, ValueError):
            print(location_nodes)
            entrance = 'Not found'

        lastdate = _last_leg_date(_labelled(master3dL, areacode, surveyname))

        print((('depth','length','surv name','entr','date'),(depth,length,surveyname,entrance,lastdate))) #sanity check print

        newcave = models.CaveM(
            survex_file = fullname,
            total_length = length,
            name=areacode+'.'+surveyname,
            total_depth = depth,
            date = lastdate,
            entrance = entrance)
        newcave.save()
    #end of reading survex masterfiles


def _import_cave_descriptions():
    """Attach every parsable cave description .html file to its stored cave."""
    print ("Reading cave descriptions")
    cavefiles = bash('find '+settings.CAVEDESCRIPTIONS+' -name \'*.html\'').splitlines()
    for fn in cavefiles:
        print(fn)
        with open(fn, "r") as f:
            contents = f.read()

        rawslug = extractXML(contents,'caveslug')
        desc = extractXML(contents,'underground_description')
        if rawslug is None or desc is None:
            print('Fucked description '+fn+' :(')
            _save_warning(fn+' Description meesed up!')
            continue

        slug = re.sub(r"\s+", "", rawslug)
        name = slug[5:] #get survex compatible name
        area = slug[0:4]
        print([area,name])

        updatecave = models.CaveM.objects.filter(survex_file__icontains=area+'/'+name+'/'+name+'.svx')
        matches = len(updatecave)
        if matches > 1:
            print('Non unique solution - skipping. Name:'+name)
            continue
        if matches == 0:
            print('Cave with no survex data:'+name)
            continue

        #exactly one match
        print('Adding desc:'+name)
        cave = updatecave[0]
        cave.description = '/cave/descriptionM/'+slug #area-name
        cave.title = name
        cave.save()

        newcavedesc = models.Cave_descriptionM(
            slug = slug,
            explorers = extractXML(contents,'explorers'),
            underground_description = desc,
            equipment = extractXML(contents,'equipment'),
            references = extractXML(contents,'references'),
            survey = extractXML(contents,'survey'),
            kataster_status = extractXML(contents,'kataster_status'),
            underground_centre_line = extractXML(contents,'underground_centre_line'),
            survex_file = extractXML(contents,'survex_file'),
            notes = extractXML(contents,'notes'))
        newcavedesc.save()
    #end of reading cave descriptions

def file_exists(filename):
    """Return True when *filename* names an existing regular file.

    The check is done in-process, so paths containing spaces or shell
    metacharacters are handled correctly and no shell is spawned per call.

    :param filename: path to test.
    :returns: ``True`` for a regular file (symlinks are followed), ``False``
        for a missing path, a directory or an empty name.
    """
    return os.path.isfile(filename)

def extractXML(contents,tag):
    """Return the text between ``<tag>`` and ``</tag>`` in *contents*.

    The search is line based: it starts at the first line containing the
    opening tag and ends at the first line, at or after it, containing the
    closing tag. Lines of a multi-line hit are joined with ``'\\n'``. Within
    that span the match is greedy, i.e. it runs up to the last closing tag.

    :param contents: whole text to search.
    :param tag: tag name without angle brackets; it is matched literally.
    :returns: the enclosed text, or ``None`` when no well-formed
        ``<tag>...</tag>`` pair is found.
    """
    open_tag = '<'+tag+'>'
    close_tag = '</'+tag+'>'
    lines = contents.splitlines()

    begi = next((i for i, line in enumerate(lines) if open_tag in line), None)
    if begi is None:
        return None
    # only a closing tag at or after the opening line can terminate the element
    endi = next((i for i in range(begi, len(lines)) if close_tag in lines[i]), None)
    if endi is None:
        return None

    segment = '\n'.join(lines[begi:endi+1])
    # escape the tags so names with regex metacharacters are matched literally
    hit = re.search(re.escape(open_tag)+'(.*)'+re.escape(close_tag), segment, re.S)
    if hit is None:
        return None
    return hit.group(1)

def bash(cmd): #calls command in the shell, returns output
    """Run *cmd* through the shell and return what it wrote to stdout as text.

    ``universal_newlines=True`` makes the output a text string on Python 3 as
    well as Python 2, so callers can split it and search it with ordinary
    string literals. stderr is not captured and the exit status is ignored.

    NOTE(review): *cmd* is interpreted by the shell; callers build it by string
    concatenation, so it must never contain untrusted input.

    :param cmd: complete shell command line.
    :returns: the command's standard output.
    """
    process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, universal_newlines=True)
    output, _ = process.communicate()
    return output