Ansichten: QuotePaste - CodePaste - NoPaste
Codesnippet eingetragen am 27.4.2015 um 15:43
Von: Michael
Sprache: Python
Beschreibung: Check and Fix broken (specific criterias apply) Dates in CSV file
CodeSnippet:
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. import csv
  4. import os
  5. import re
  6. import sys
  7.  
  8. if(len(sys.argv) != 2):
  9. print "Usage: "+str(sys.argv[0])+" filename.ext"
  10. sys.exit("Wrong usage, make sure filename has an extension")
  11.  
  12. filepath=os.path.basename(sys.argv[1])
  13. filename=os.path.splitext(filepath)[0]
  14. extension=os.path.splitext(filepath)[1]
  15. print "Filename: "+filename
  16. print "Extension: "+extension
  17.  
  18. counter=0
  19. backup_for_nullbyte=-1
  20. cols_to_ignore = []
  21. cols_to_fix = []
  22.  
  23.  
  24. # The CSV Module can't handle NULL-Bytes, therefore, we're replacing them temporarily
  25. print "Temporarily replacing NULLBYTE ..."
  26. fi = open(filename+extension, 'rb')
  27. data = fi.read()
  28. fi.close()
  29. # We want to replace the NUL Bytes with another unused special char, therefore we're testing
  30. # the 3 DC-Bytes (DC4 is already used by concordance as seperator)
  31. for i in range(3):
  32. if (data.find(chr(17+i)) == -1):
  33. backup_for_nullbyte = chr(17+i)
  34. print "Replacing NULLBYTE 0x00 with "+hex(ord(backup_for_nullbyte))
  35. break;
  36. if(backup_for_nullbyte == -1):
  37. sys.exit("ABORTING: Wasn't able to securely backup NULL-Bytes.")
  38.  
  39. fo = open(filename+'_no_null'+extension, 'wb')
  40. fo.write(data.replace('\x00', backup_for_nullbyte))
  41. fo.close()
  42. print "Done"
  43.  
  44. def fixdate(date):
  45. global counter
  46. valid=True
  47. # Slice away the quotingchars from concordance
  48. temp_date = date[2:-2]
  49.  
  50. # Plausability checks:
  51. # Check if date has more than 19 digits (e.g. 5 digits for the year)
  52. if(len(temp_date) > 19):
  53. valid=False
  54. # Check if year is to early, e.g. prior 1950
  55. elif(int(temp_date[0:4]) < 1950):
  56. valid=False
  57. elif(int(temp_date[0:4]) > 2050):
  58. valid=False
  59. # Insert further checks here
  60.  
  61. if(not valid):
  62. # print erroneus date
  63. print date
  64. date = "þ1970/01/01 00:00:00þ"
  65. counter+=1
  66. return date
  67.  
  68. print "Fixing CSV ..."
  69. with open(filename+'_no_null'+extension, 'rb') as csvfile, open(filename+'_no_null_output'+extension, 'wb') as outfile:
  70. csvreader = csv.reader(csvfile, delimiter='\x14')
  71. csvwriter = csv.writer(outfile, delimiter='\x14', quotechar='\x00', quoting=csv.QUOTE_NONE)
  72.  
  73. # Extract Tableheaders to prompt them later in the accept/ignore list
  74. table_header = next(csvreader, None)
  75. for i, element in enumerate(table_header):
  76. table_header[i] = element[2:-2]
  77. csvwriter.writerow(table_header) # copy header line to new file
  78.  
  79. # Start actual read/modify/write process
  80. for row in csvreader:
  81. # Advanced python loop: i is an index, cell is the same element as row[i].
  82. for i, cell in enumerate(row):
  83. # skip check if user decided to ignore this column
  84. if(i in cols_to_ignore):
  85. continue
  86. # Find Timestamps in format YYYY(Y)/MM/DD HH:MM:SS (if 5. Y exists, it's a faulty entry)
  87. if(re.match(\d{4,5}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}þ", cell)):
  88. if(i not in cols_to_fix):
  89. while(True):
  90. choice = str(raw_input("This is the first time I found a date in column '"+table_header[i]+"', do you want me to fix or ignore possible errors in this column? (F)ix/(i)gnore "))
  91. if(choice == 'F' or choice == 'f' or not choice):
  92. cols_to_fix.append(i)
  93. row[i] = fixdate(cell)
  94. break
  95. elif(choice == 'i'):
  96. cols_to_ignore.append(i)
  97. break
  98. else:
  99. row[i] = fixdate(cell)
  100.  
  101. csvwriter.writerow(row)
  102.  
  103. print "Done"
  104.  
  105. # Restore the NULL-Bytes
  106. print "Restoring NULLBYTE ..."
  107. fi = open(filename+'_no_null_output'+extension, 'rb')
  108. data = fi.read()
  109. fi.close()
  110. fo = open(filename+'_fixed'+extension, 'wb')
  111. fo.write(data.replace(backup_for_nullbyte, '\x00'))
  112. fo.close()
  113. print "Done"
  114.  
  115. print "Fixed "+str(counter)+" Dates in total"