Michaels PasteCenter

#!/usr/bin/env python 
# -*- coding: utf-8 -*-
import csv
import os
import re
import sys
 
if(len(sys.argv) != 2):
  print "Usage: "+str(sys.argv[0])+" filename.ext"
  sys.exit("Wrong usage, make sure filename has an extension")
 
filepath=os.path.basename(sys.argv[1])
filename=os.path.splitext(filepath)[0]
extension=os.path.splitext(filepath)[1]
print "Filename: "+filename
print "Extension: "+extension
 
counter=0
backup_for_nullbyte=-1
cols_to_ignore = []
cols_to_fix = []
 
 
# The CSV Module can't handle NULL-Bytes, therefore, we're replacing them temporarily
print "Temporarily replacing NULLBYTE ..."
fi = open(filename+extension, 'rb')
data = fi.read()
fi.close()
# We want to replace the NUL Bytes with another unused special char, therefore we're testing
# the 3 DC-Bytes (DC4 is already used by concordance as seperator)
for i in range(3):
  if (data.find(chr(17+i)) == -1):
    backup_for_nullbyte = chr(17+i)
    print "Replacing NULLBYTE 0x00 with "+hex(ord(backup_for_nullbyte))
    break;
if(backup_for_nullbyte == -1):
  sys.exit("ABORTING: Wasn't able to securely backup NULL-Bytes.")
 
fo = open(filename+'_no_null'+extension, 'wb')
fo.write(data.replace('\x00', backup_for_nullbyte))
fo.close()
print "Done"
 
def fixdate(date):
  global counter
  valid=True
  # Slice away the quotingchars from concordance
  temp_date = date[2:-2]
 
  # Plausability checks:
  # Check if date has more than 19 digits (e.g. 5 digits for the year)
  if(len(temp_date) > 19):
   valid=False
  # Check if year is to early, e.g. prior 1950
  elif(int(temp_date[0:4]) < 1950):
    valid=False
  elif(int(temp_date[0:4]) > 2050):
    valid=False
  # Insert further checks here
 
  if(not valid):
   # print erroneus date
   print date
   date = "þ1970/01/01 00:00:00þ"
   counter+=1
  return date
 
print "Fixing CSV ..."
with open(filename+'_no_null'+extension, 'rb') as csvfile, open(filename+'_no_null_output'+extension, 'wb') as outfile:
  csvreader = csv.reader(csvfile, delimiter='\x14')
  csvwriter = csv.writer(outfile, delimiter='\x14', quotechar='\x00', quoting=csv.QUOTE_NONE)
 
  # Extract Tableheaders to prompt them later in the accept/ignore list
  table_header = next(csvreader, None)
  for i, element in enumerate(table_header):
    table_header[i] = element[2:-2]
  csvwriter.writerow(table_header) # copy header line to new file
 
  # Start actual read/modify/write process
  for row in csvreader:
    # Advanced python loop: i is an index, cell is the same element as row[i].
    for i, cell in enumerate(row):
      # skip check if user decided to ignore this column
      if(i in cols_to_ignore):
	continue
      # Find Timestamps in format YYYY(Y)/MM/DD HH:MM:SS (if 5. Y exists, it's a faulty entry)
      if(re.match("þ\d{4,5}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}þ", cell)):
	if(i not in cols_to_fix):
	  while(True):
	    choice = str(raw_input("This is the first time I found a date in column '"+table_header[i]+"', do you want me to fix or ignore possible errors in this column? (F)ix/(i)gnore  "))
	    if(choice == 'F' or choice == 'f' or not choice):
	      cols_to_fix.append(i)
	      row[i] = fixdate(cell)
	      break
	    elif(choice == 'i'):
	      cols_to_ignore.append(i)
	      break
	else:
	  row[i] = fixdate(cell)
 
    csvwriter.writerow(row)
 
print "Done"
 
# Restore the NULL-Bytes
print "Restoring NULLBYTE ..."
fi = open(filename+'_no_null_output'+extension, 'rb')
data = fi.read()
fi.close()
fo = open(filename+'_fixed'+extension, 'wb')
fo.write(data.replace(backup_for_nullbyte, '\x00'))
fo.close()
print "Done"
 
print "Fixed "+str(counter)+" Dates in total"