-
Notifications
You must be signed in to change notification settings - Fork 8
/
replace_matches.awk
108 lines (103 loc) · 3.57 KB
/
replace_matches.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/awk -f
#
# This script loads a file containing words and their substitutions
# (same syntax as property files). Then a specific field of the input
# file is replaced with the relevant substitution, if one exists.
# examples from substitutions file:
# create a file with key/value entries e.g. "subst.txt", one entry per row such as:
#
# EUR=Europe
# USA=United States of America
# MEA=Middle East
#
# when you run this script pass the name of the file with the substitutions
# to it.
# you also need to specify which field (column) of the input file you want
# to have checked and substituted.
#
# example input file:
#
# hallo;blue;winter;EUR;fifteen
# hello;red;summer;USA;three
# haji;yellow;summer;AFR;ninehundred
#
# save this example as "inputfile.csv". then run this awk script from the appropriate
# folder as follows:
#
# ./replace_matches.awk -v fieldnumber=4 -v substitutionfile=subst.txt inputfile.csv
#
# process description: the script reads the entries of the substitution file subst.txt
# and replaces all occurrences in field/column number 4 in the inputfile.
# so e.g. all entries that have the value "EUR" in column 4 of the input file will
# be changed to read "Europe" and all "USA" values in column 4 will be replaced with
# "United States of America" instead.
#
# Note: be careful not to replace existing data with data that contains the separator
# character (which is used to divide the individual columns from each other).
# Otherwise you will create output where the rows might have different numbers of
# columns.
#
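# Note: make sure that, in the substitution file, the divider used to separate the key
# from the value does not appear elsewhere in the data. E.g. if you use "=" as the
# divider, it should not appear in the data itself; each line is split at the first
# occurrence of the divider, so otherwise you might get undesired results.
# The divider defaults to "=" and can be changed with: -v devider=<string>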
#
#
# uwe.geercken@datamelt.com
# http://datamelt.com
#
# last update: 2010-01-14
#
#
# Before the main input file is processed, load the substitution file into
# memory as the associative array `entry` (key -> replacement value).
#
# Variables read (set with -v on the command line):
#   substitutionfile  path of the key/value file to load
#   devider           optional divider between key and value; defaults to "="
#
# Lines without a divider, lines with an empty key and lines whose key starts
# with "#" (comments) are ignored.
BEGIN {
    # define input and output field separators of the main file
    FS = ";";
    OFS = ";";

    # divider between key and value pairs in the substitution file
    if (devider)
    {
        substitutions_devider = devider;
    }
    else
    {
        substitutions_devider = "=";
    }
    # the value starts right after the complete divider, which may be longer
    # than a single character
    devider_length = length(substitutions_devider);

    # loop over all lines in the substitution file; getline yields 1 for each
    # line read, 0 at end of file and -1 if the file cannot be read
    while ((read_status = (getline substitution < substitutionfile)) > 0)
    {
        # position of the first occurrence of the divider (0 if absent)
        pos = index(substitution, substitutions_devider);
        if (pos == 0)
        {
            continue;
        }
        # the part before the divider is the key of the array entry
        key = substr(substitution, 1, pos - 1);
        if (key != "" && substr(key, 1, 1) != "#") # only if not empty and not a comment
        {
            # everything after the divider is the actual substitution
            value = substr(substitution, pos + devider_length);
            entry[key] = value;
        }
    }

    if (read_status < 0)
    {
        # do not fail silently: without substitutions the input would simply
        # be passed through unchanged and the mistake would go unnoticed
        print "replace_matches.awk: cannot read substitution file \"" substitutionfile "\"" > "/dev/stderr";
    }
    else
    {
        # release the file handle; the file is not needed any more
        close(substitutionfile);
    }
}
# The following is executed for each row of the input file: if the value in
# the column given by `fieldnumber` is a key of the `entry` array, it is
# replaced by the corresponding substitution. The row is then printed.
{
    # Test membership with `in` instead of reading entry[$fieldnumber]:
    # merely referencing a missing element creates it, so the array would
    # grow by one empty element for every distinct unmatched value of the
    # input. Using `in` also honours a key that is deliberately mapped to an
    # empty value, which a truthiness test on the value would treat as
    # "no match".
    if (($fieldnumber) in entry)
    {
        # print "line " NR ": [" $fieldnumber "] replaced by [" entry[$fieldnumber] "]";
        # replace the original value of the field with the one that matched
        $fieldnumber = entry[$fieldnumber];
    }
    # Rows whose value has no equivalent in the substitution file are passed
    # through unchanged.
    print;
}