# comfix.awk - expand comma-abbreviated headwords in a raw index.
#
# When the raw index has a lot of entries like
#	1578324 problematico, a, ci, che
# apply this algorithm:
#	treat the things after each comma as suffixes of the first word
#	for each suffix:
#		if it is a single letter, replace the last letter of the word
#		else search backwards in the word for the first letter of
#		the suffix, and if that leads to an old suffix of
#		approximately the same length, replace that old suffix
# A suffix that cannot be matched is written out as "word, suffix",
# so this will still leave some commas to fix by hand.
#
# Input lines must split into exactly two fields (key, headword list)
# to be rewritten; every other line is copied through unchanged.
# Output entries are written as key<TAB>word.
#
# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
#	(NOTE(review): the separator in the usage line is presumed to be
#	a tab, matching the tab-separated output -- confirm against the
#	real rawindex format)

# Lines that are not "key, headwords" pairs pass through untouched.
NF != 2 {
	print $0
	next
}

# No comma in the headword field (this also covers an empty field):
# nothing to expand.
index($2, ",") == 0 {
	print $0
	next
}

# key<SEP>word, suf1, suf2, ...  ->  one output line per derived word.
{
	nparts = split($2, parts, /,[ ]*/)
	word = parts[1]

	# The base word is always emitted first, as is.
	printf "%s\t%s\n", $1, word

	for (p = 2; p <= nparts; p++) {
		suf = parts[p]
		drop = matchsuflen(word, suf)
		if (drop) {
			# Replace the last `drop` characters of the base word.
			printf "%s\t%s\n", $1, substr(word, 1, length(word) - drop) suf
		} else {
			# No plausible old suffix: keep the comma form so it
			# can be found and fixed by hand.
			printf "%s\t%s\n", $1, word ", " suf
		}
	}
}

# matchsuflen(w, suf): how many trailing characters of w should be
# replaced by suf, or 0 if no plausible replacement point exists.
#
#	w	the base word
#	suf	the replacement suffix
#
# Returns 1 for a single-letter suffix (replace the last letter).
# Otherwise returns the distance k from the end of w to the nearest
# occurrence (searching backwards) of suf's first character, provided
# k differs from length(suf) by at most 3.  Returns 0 when w or suf is
# empty, when the character does not occur in w, or when the lengths
# differ by more than 3.
#
# Names after the extra whitespace in the parameter list are locals.
function matchsuflen(w, suf,		wlen, suflen, c, k, d) {
	wlen = length(w)
	suflen = length(suf)

	# An empty word has no last letter to replace, and an empty suffix
	# has no first character to search for.
	if (wlen == 0 || suflen == 0)
		return 0

	if (suflen == 1)
		return 1

	# Scan backwards from the end of w for the suffix's first letter.
	c = substr(suf, 1, 1)
	for (k = 1; k <= wlen; k++)
		if (substr(w, wlen - k + 1, 1) == c)
			break
	if (k > wlen)
		return 0

	# Accept only if the old suffix is about as long as the new one.
	d = k - suflen
	if (d < 0)
		d = -d
	if (d > 3)
		return 0
	return k
}