python - Detect header delimiters when importing a CSV file using pandas read_csv
Here I go again.
Hi, I'm trying to detect an error in a CSV file.
The file should be as follows:
goodfile.csv "col_a","col_b","col_c","col_d" "row1cola","row1colb","row1colc","row1cold" "row2cola","row2colb","row2colc","row2cold" "row3cola","row3colb","row3colc","row3cold" "row4cola","row4colb","row4colc","row4cold" "row5cola","row5colb","row5colc","row5cold" "row6cola","row6colb","row6colc","row6cold" "row7cola","row7colb","row7colc","row7cold"
But the file actually contains:
brokenfile.csv "col_a","col_b",col c,"col_d" "row1cola","row1colb","row1colc","row1cold" "row2cola","row2colb","row2colc","row2cold" "row3cola","row3colb","row3colc","row3cold" "row4cola","row4colb","row4colc","row4cold" "row5cola","row5colb","row5colc","row5cold" "row6cola","row6colb","row6colc","row6cold" "row7cola","row7colb","row7colc","row7cold"
When I import the two files with pandas:
# Load each file with pandas' default quoting; the second call rebinds `data`.
data = pd.read_csv('goodfile.csv')
data = pd.read_csv('brokenfile.csv')
I get the same result:
data col_a col_b col_c col_d 0 row1cola row1colb row1colc row1cold 1 row2cola row2colb row2colc row2cold 2 row3cola row3colb row3colc row3cold 3 row4cola row4colb row4colc row4cold 4 row5cola row5colb row5colc row5cold 5 row6cola row6colb row6colc row6cold 6 row7cola row7colb row7colc row7cold
Anyway, I want to detect the error in the second file, "brokenfile.csv", which lacks the double quotes ("") around the header col_c.
I think you can detect the missing `"` in the columns of the DataFrame with `str.contains`, boolean indexing, and inverting the boolean array with `~`:
import io

import pandas as pd

# Sample of the broken file: the third header field (col c) was written
# without surrounding double quotes, unlike every other field.
temp = u'''"col_a","col_b",col c,"col_d"
"row1cola","row1colb","row1colc","row1cold"
"row2cola","row2colb","row2colc","row2cold"
"row3cola","row3colb","row3colc","row3cold"
"row4cola","row4colb","row4colc","row4cold"
"row5cola","row5colb","row5colc","row5cold"
"row6cola","row6colb","row6colc","row6cold"
"row7cola","row7colb","row7colc","row7cold"'''

# After testing, replace io.StringIO(temp) with the filename.
# quoting=3 is csv.QUOTE_NONE: quote characters are not stripped, so they stay
# part of each header/value and an unquoted header can be told apart.
df = pd.read_csv(io.StringIO(temp), quoting=3)

# Expected output is shown in the comments below (exact spacing and the
# reported dtype may differ between pandas versions).
print(df)
#       "col_a"     "col_b"       col c     "col_d"
# 0  "row1cola"  "row1colb"  "row1colc"  "row1cold"
# 1  "row2cola"  "row2colb"  "row2colc"  "row2cold"
# 2  "row3cola"  "row3colb"  "row3colc"  "row3cold"
# 3  "row4cola"  "row4colb"  "row4colc"  "row4cold"
# 4  "row5cola"  "row5colb"  "row5colc"  "row5cold"
# 5  "row6cola"  "row6colb"  "row6colc"  "row6cold"
# 6  "row7cola"  "row7colb"  "row7colc"  "row7cold"

print(df.columns)
# Index(['"col_a"', '"col_b"', 'col c', '"col_d"'], dtype='object')

# True where the header name still carries a double quote.
print(df.columns.str.contains('"'))
# [ True  True False  True]

# Invert the mask with ~ to flag the headers that lack quotes.
print(~df.columns.str.contains('"'))
# [False False  True False]

# Boolean indexing with the inverted mask selects the offending header(s).
print(df.columns[~df.columns.str.contains('"')])
# Index(['col c'], dtype='object')
Comments
Post a Comment