# bobbycar/logdata/logfix.py
# 2025-07-17 17:35:02 +02:00
import argparse
import os.path
import re
import time
from datetime import datetime, timezone

import numpy as np
import pytz
# Command line interface:
#   -i/--input        one or more log files written by the bobbycar SD logger
#   -o/--output       optional suffix inserted into the generated csv filename
#   -c/--consecutive  follow numbered files (log001, log002, ...) from one input
#   -t/--time         gap (minutes) after which a new output csv is started
parser = argparse.ArgumentParser(description='Copys, renames and fixes logfiles written by bobbycar sd logger.')
parser.add_argument('-i', '--input', type=argparse.FileType('r'), nargs='+', required=True, help="list of input log files")
parser.add_argument('-o', '--output', nargs='?', type=str, help="output filename")
parser.add_argument('-c','--consecutive', action="store_true", help="add consecutive files to input. If the input file ends with a number the following logfiles will be added.")
parser.add_argument("-t", "--time", nargs='?', type=int, help="Create new csv file after time without data. In Minutes.")
args = parser.parse_args()
# Write-enable flag for the output stage; never set to False anywhere in this
# script, so output is always written.
ok=True
def getTimestamp(plines):
    """Return the unix timestamp recorded in the first 'TIMESTAMP:' line.

    The logger writes a comment line like ``#TIMESTAMP:1600000000`` when a
    file is created. Prints an error and exits if no such line exists — the
    original code raised an IndexError in that case (its ``timestampline==-1``
    check was dead code, since the index was incremented before being tested).

    :param plines: iterable of raw log lines
    :return: timestamp (int, seconds) when the file was created
    """
    for line in plines:
        if 'TIMESTAMP:' in line:
            # int() tolerates surrounding whitespace such as a trailing newline
            return int(line.split('TIMESTAMP:')[1])
    print("Error: Timestamp not found!")
    exit()
def filterLines(plines, plinesStarttime=None):
    """Split raw log lines into header / data / comment / failure groups.

    The first non-comment line is taken as the CSV header. A data line is
    "OK" when it has the same number of comma-separated fields as the header;
    repeated header lines (from concatenated files) are removed from the OK
    set. If ``plinesStarttime`` (np.ndarray, one entry per input line) is
    given, it is filtered in lockstep with the OK data lines.

    :param plines: list of raw lines (trailing newlines are stripped here)
    :param plinesStarttime: optional per-line start-timestamp array
    :return: (plines, pheader, pcommentlinesMask, pdatalines, pdatalinesFail,
              pdatalinesOK, pheaderSize, plinesOK, plinesStarttime)
    """
    plines = np.array([x.rstrip("\n") for x in plines])
    pcommentlinesMask = np.array([x.startswith('#') for x in plines])
    datamask = ~pcommentlinesMask  # non-comment lines carry data
    if plinesStarttime is not None:  # keep start times aligned with data lines
        plinesStarttime = plinesStarttime[datamask]
    pdatalines = plines[datamask]
    pheader = pdatalines[0]  # header is the first non-comment line
    pheaderSize = len(pheader.split(','))  # expected field count per line
    # Mask of data lines whose field count matches the header.
    sizeOKMask = np.array([len(x.split(',')) for x in pdatalines]) == pheaderSize
    if plinesStarttime is not None:
        plinesStarttime = plinesStarttime[sizeOKMask]
    pdatalinesOK = pdatalines[sizeOKMask]
    pdatalinesFail = pdatalines[~sizeOKMask]
    # Drop repeated header lines (concatenated files) from the OK set,
    # keeping plinesStarttime in lockstep; both become plain lists here,
    # which downstream slicing relies on.
    if plinesStarttime is not None:
        plinesStarttime = [plinesStarttime[i] for i, x in enumerate(pdatalinesOK) if x != pheader]
    pdatalinesOK = [x for x in pdatalinesOK if x != pheader]
    # Per-line OK mask over ALL lines (only meaningful for data lines).
    plinesOK = np.array([len(x.split(',')) for x in plines]) == pheaderSize
    return plines, pheader, pcommentlinesMask, pdatalines, pdatalinesFail, pdatalinesOK, pheaderSize, plinesOK, plinesStarttime
# Build the list of input file names. In consecutive mode the trailing number
# in the single input filename is incremented until a file does not exist.
inputFilenames = []
if (args.consecutive):
    if (len(args.input) != 1):
        # parser.error() prints the message and exits with status 2, so the
        # old extra exit() call after it was dead code. ("consequtive" typo
        # in the message fixed as well.)
        parser.error("in consecutive mode exactly one input file is required")
    nextFilename = args.input[0].name
    while os.path.isfile(nextFilename):
        print(nextFilename+" exists")
        inputFilenames.append(nextFilename)
        # Locate the LAST run of digits in the name (e.g. "log003.csv") and
        # increment it, preserving its zero padding.
        match = re.search(r'(\d+)(?=\D*$)', nextFilename)
        if match is None:
            # No number in the name: nothing to increment. The original
            # scanning loop crashed with int('') here; stop cleanly instead.
            break
        width = match.end(1) - match.start(1)
        number = int(match.group(1)) + 1
        nextFilename = nextFilename[:match.start(1)] + str(number).zfill(width) + nextFilename[match.end(1):]
else:
    inputFilenames = [x.name for x in args.input]
# Read all input files, verify they share the same CSV header, and remember
# each file's creation timestamp once per line of that file.
lines = []
linesStarttime = []  # per-line file start timestamp; later combined into a new column
header = ""
for inputFilename in inputFilenames:
    print("Reading "+str(inputFilename))
    with open(inputFilename, 'r') as reader:
        inputlines = reader.readlines()
    lines += inputlines
    # Check that this file's header matches the first file's header.
    _lines, _header, _, _, _, _, _, _, _ = filterLines(inputlines)
    if header == "":  # first file defines the expected header
        header = _header
    if header != _header:
        # explicit check instead of `assert` so validation survives `python -O`
        raise AssertionError("Header is different!")
    _timestamp = getTimestamp(_lines)
    print("Timestamp="+str(_timestamp))
    # every line of this file shares the file's start timestamp
    linesStarttime += [_timestamp] * len(inputlines)
    print("Lines in file="+str(len(inputlines)))
if len(lines) != len(linesStarttime):
    raise AssertionError("Length of lines and linesStarttime does not match")
linesStarttime = np.array(linesStarttime)
lines, header, commentlinesMask, datalines, datalinesFail, datalinesOK, headerSize, linesOK, linesStarttime = filterLines(lines, linesStarttime)
print("Found "+str(len(lines))+" lines")
print(str(np.sum(commentlinesMask))+" comments")
print(str(len(datalinesFail))+" Datalines Failed")
print(str(len(datalinesOK))+" Datalines OK")
print("Header Size is "+str(headerSize))
timestamp = getTimestamp(lines)  # first timestamp in the combined log
# Divide the OK data lines into sets (one output file each); a new set starts
# whenever the gap between consecutive log timestamps exceeds -t/--time.
last_timestamp = timestamp
next_timestamp = timestamp
set_datalinesOK = []
set_linesStarttime = []
last_idata = 0
if args.time is not None:
    # --time is documented in minutes while the timestamps below are unix
    # seconds, so convert the threshold to seconds (the original compared
    # minutes against seconds directly).
    newfile_timestamp_difference = args.time * 60
    for idata, data in enumerate(datalinesOK):
        last_timestamp = next_timestamp
        # absolute line time = file start timestamp + first CSV column
        next_timestamp = linesStarttime[idata] + float(data.split(',')[0])
        # Gap too large: close the current set and start a new one. The
        # idata != last_idata guard prevents an empty first set when the
        # very first line already exceeds the threshold.
        if next_timestamp - last_timestamp > newfile_timestamp_difference and idata != last_idata:
            set_datalinesOK.append(datalinesOK[last_idata:idata])
            set_linesStarttime.append(linesStarttime[last_idata:idata])
            last_idata = idata
set_datalinesOK.append(datalinesOK[last_idata:])  # remainder forms the last set
set_linesStarttime.append(linesStarttime[last_idata:])
# Write one CSV file per set, named after the set's start time.
for iset, current_datalinesOK in enumerate(set_datalinesOK):
    print("Creating Output for set "+str(iset))
    current_linesStarttime = set_linesStarttime[iset]
    if len(current_linesStarttime) == 0:
        # Guard against an empty set (e.g. no data lines at all); the
        # original crashed with an IndexError below in that case.
        print("Skipping empty set "+str(iset))
        continue
    timestamp = current_linesStarttime[0]  # start timestamp of this set's first line
    # Use an aware UTC datetime. The previous naive
    # datetime.utcfromtimestamp(ts).astimezone(Berlin) depended on the
    # machine's local timezone (and utcfromtimestamp is deprecated).
    # NOTE(review): this names files by UTC wall time, which matches the old
    # behaviour on a machine running in Europe/Berlin — confirm that Berlin
    # local time naming is not wanted instead.
    filetime = datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y%m%d_%H%M%S')
    outputFilename = ""
    if args.output is not None:
        outputFilename = "_"+args.output
    outputFilename = filetime+outputFilename+".csv"
    print("Timestamp:"+str(timestamp)+" -> "+str(filetime))
    print("UTC: "+ datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%A, %Y-%m-%d %H:%M:%S'))
    print("Writing to: "+str(outputFilename))
    # Build the output header ONCE PER FILE from the shared input header. The
    # original executed header="timestamp,"+header inside this loop, so every
    # set after the first got "timestamp," prepended repeatedly.
    outheader = "timestamp,"+header
    # prepend an absolute-timestamp column: file start timestamp + line time (col 0)
    writelines = [str(current_linesStarttime[i]+float(x.split(',')[0]))+","+x for i, x in enumerate(current_datalinesOK)]
    linesWritten = 0
    if ok:  # module-level write-enable flag (always True in current code)
        with open(outputFilename, 'w') as writer:
            writer.write(outheader+"\n")  # write header
            for line in writelines:
                writer.write(line+"\n")
                linesWritten += 1
        print(str(linesWritten)+" lines written to "+str(outputFilename))
    else:
        print("Failed!")