#!/usr/bin/env python
# -*- coding: utf-8 -*-
import csv
import os
import re
import sys
if(len(sys.argv) != 2):
print "Usage: "+str(sys.argv[0])+" filename.ext"
sys.exit("Wrong usage, make sure filename has an extension")
filepath=os.path.basename(sys.argv[1])
filename=os.path.splitext(filepath)[0]
extension=os.path.splitext(filepath)[1]
print "Filename: "+filename
print "Extension: "+extension
counter=0
backup_for_nullbyte=-1
cols_to_ignore = []
cols_to_fix = []
# The CSV Module can't handle NULL-Bytes, therefore, we're replacing them temporarily
print "Temporarily replacing NULLBYTE ..."
fi = open(filename+extension, 'rb')
data = fi.read()
fi.close()
# We want to replace the NUL Bytes with another unused special char, therefore we're testing
# the 3 DC-Bytes (DC4 is already used by concordance as seperator)
for i in range(3):
if (data.find(chr(17+i)) == -1):
backup_for_nullbyte = chr(17+i)
print "Replacing NULLBYTE 0x00 with "+hex(ord(backup_for_nullbyte))
break;
if(backup_for_nullbyte == -1):
sys.exit("ABORTING: Wasn't able to securely backup NULL-Bytes.")
fo = open(filename+'_no_null'+extension, 'wb')
fo.write(data.replace('\x00', backup_for_nullbyte))
fo.close()
print "Done"
def fixdate(date):
global counter
valid=True
# Slice away the quotingchars from concordance
temp_date = date[2:-2]
# Plausability checks:
# Check if date has more than 19 digits (e.g. 5 digits for the year)
if(len(temp_date) > 19):
valid=False
# Check if year is to early, e.g. prior 1950
elif(int(temp_date[0:4]) < 1950):
valid=False
elif(int(temp_date[0:4]) > 2050):
valid=False
# Insert further checks here
if(not valid):
# print erroneus date
print date
date = "þ1970/01/01 00:00:00þ"
counter+=1
return date
print "Fixing CSV ..."
with open(filename+'_no_null'+extension, 'rb') as csvfile, open(filename+'_no_null_output'+extension, 'wb') as outfile:
csvreader = csv.reader(csvfile, delimiter='\x14')
csvwriter = csv.writer(outfile, delimiter='\x14', quotechar='\x00', quoting=csv.QUOTE_NONE)
# Extract Tableheaders to prompt them later in the accept/ignore list
table_header = next(csvreader, None)
for i, element in enumerate(table_header):
table_header[i] = element[2:-2]
csvwriter.writerow(table_header) # copy header line to new file
# Start actual read/modify/write process
for row in csvreader:
# Advanced python loop: i is an index, cell is the same element as row[i].
for i, cell in enumerate(row):
# skip check if user decided to ignore this column
if(i in cols_to_ignore):
continue
# Find Timestamps in format YYYY(Y)/MM/DD HH:MM:SS (if 5. Y exists, it's a faulty entry)
if(re.match("þ\d{4,5}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}þ", cell)):
if(i not in cols_to_fix):
while(True):
choice = str(raw_input("This is the first time I found a date in column '"+table_header[i]+"', do you want me to fix or ignore possible errors in this column? (F)ix/(i)gnore "))
if(choice == 'F' or choice == 'f' or not choice):
cols_to_fix.append(i)
row[i] = fixdate(cell)
break
elif(choice == 'i'):
cols_to_ignore.append(i)
break
else:
row[i] = fixdate(cell)
csvwriter.writerow(row)
print "Done"
# Restore the NULL-Bytes
print "Restoring NULLBYTE ..."
fi = open(filename+'_no_null_output'+extension, 'rb')
data = fi.read()
fi.close()
fo = open(filename+'_fixed'+extension, 'wb')
fo.write(data.replace(backup_for_nullbyte, '\x00'))
fo.close()
print "Done"
print "Fixed "+str(counter)+" Dates in total"