-
Notifications
You must be signed in to change notification settings - Fork 3
/
klk-add-binding-attrs.sh
executable file
·78 lines (63 loc) · 2.06 KB
/
klk-add-binding-attrs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#! /bin/sh
# -*- coding: utf-8 -*-

# Add the s-attributes text_binding_id and text_publ_type to the KLK
# corpora specified as command-line arguments
#
# Usage: klk-add-binding-attrs.sh corpus ...

# Directory of this script; the helper klk-extract-binding-info.py is
# run from here. Quoted so that a script path containing whitespace
# does not get split.
progdir=$(dirname "$0")

# Directory holding the original data file and the intermediate .tsv
# files written by this script
origdatadir=/v/corpora/vrt/klk_links

# CWB registry directory, data directory (one subdirectory per corpus)
# and the directory containing the CWB command-line tools
cwb_regdir=/v/corpora/registry
cwb_datadir=/v/corpora/data
cwb_bindir=/usr/local/cwb/bin

# Input of klk-extract-binding-info.py and the sorted file generated
# from its output
origdatafile=$origdatadir/digi_content.csv
binding_info_file=$origdatadir/binding_info.tsv

# Byte-wise collation everywhere, so that the files produced by sort
# are ordered the way join expects them
export LC_ALL=C
# Generate the sorted binding information file unless it already
# exists and is non-empty.
#
# Globals (read): progdir, origdatafile, binding_info_file
# Side effects: runs $progdir/klk-extract-binding-info.py on
#   $origdatafile and writes its byte-wise sorted output to
#   $binding_info_file.
init () {
    if ! [ -s "$binding_info_file" ]; then
        # Sort in the C locale: the file is later used as an input of
        # join, which needs its inputs ordered on the join field
        "$progdir/klk-extract-binding-info.py" "$origdatafile" |
            LC_ALL=C sort > "$binding_info_file"
    fi
}
# Declare the structural attribute ${elem}_$attrname in the registry
# file of a corpus, unless the registry already declares it.
#
# Arguments:
#   $1 - corpus name (name of the registry file in $cwb_regdir)
#   $2 - structure (XML element) name, e.g. "text"
#   $3 - attribute name without the element prefix
# Globals: cwb_regdir (read); corpus, elem, attrname, regfile (set)
# Side effects: the previous registry file is kept as <regfile>.old
# Returns: 0 if the attribute was already declared; otherwise the
#   status of the backup copy or of the rewrite
add_attr () {
    corpus=$1
    elem=$2
    attrname=$3
    regfile=$cwb_regdir/$corpus
    # Edit registry file directly instead of using cwb-regedit so that
    # the attribute is added to the same group "XML element"
    if grep -q "^STRUCTURE ${elem}_$attrname " "$regfile"; then
        return 0
    fi
    # The redirection below truncates the registry file before awk
    # starts, so never get that far without a usable backup to read
    cp -p "$regfile" "$regfile.old" || return
    # The names are passed with -v instead of being spliced into the
    # program text
    awk -v elem="$elem" -v attr="$attrname" '
        BEGIN {
            decl = sprintf("STRUCTURE %-20s # [annotations]", elem "_" attr)
        }
        # Comment line showing the element with its attributes: append
        # the new attribute after the last existing one
        $0 ~ ("^# <" elem " ") { sub(/">/, "\" " attr "=\"..\">") }
        # Inside the group of declarations belonging to the element
        $0 ~ ("^STRUCTURE " elem "[ _]") { in_group = 1 }
        # The first empty line after the group ends it: add the new
        # declaration as the last line of the group
        /^$/ && in_group { print decl; in_group = 0 }
        { print }
        # The group may also be ended by the end of the file
        END { if (in_group) print decl }
    ' "$regfile.old" > "$regfile"
    # cwb-regedit -r $cwb_regdir $(echo $corpus | sed -e 's/.*/\U&\E/') \
    #     :add :s text_$attrname
}
# Encode one structural attribute of a corpus from selected columns of
# the corpus-specific binding information file, then declare the
# attribute in the registry with add_attr.
#
# Arguments:
#   $1 - corpus name
#   $2 - attribute name without the "text_" prefix
#   $3 - field list for cut(1), e.g. "1,2,4"
# Globals: origdatadir, cwb_bindir, cwb_regdir, cwb_datadir (read);
#   corpus, attrname, fields (set)
# Returns: the status of add_attr
encode () {
    corpus=$1
    attrname=$2
    fields=$3
    # The input is tab-separated. The tab is generated with printf so
    # that it cannot be silently turned into a space by an editor.
    cut -d "$(printf '\t')" -f "$fields" \
        "$origdatadir/binding_info_$corpus.tsv" |
        "$cwb_bindir/cwb-s-encode" -r "$cwb_regdir" \
            -d "$cwb_datadir/$corpus" -V "text_$attrname"
    add_attr "$corpus" text "$attrname"
}
add_binding () {
corpus=$(basename $1)
$cwb_bindir/cwb-s-decode -r $cwb_regdir $corpus -S text_img_url |
perl -ne '/(.*\t)#DIRECTORY#(.*?)#SEPARATOR#/; print "$1$2\n"' |
sort -t' ' -s -k3,3 > $origdatadir/img_urls_$corpus.tsv
join -t' ' -13 -21 -a1 -o '1.1 1.2 1.3 2.2 2.3' \
$origdatadir/img_urls_$corpus.tsv $binding_info_file |
sort -s -k1,1n > $origdatadir/binding_info_$corpus.tsv
encode $corpus binding_id 1,2,4
encode $corpus publ_type 1,2,5
}
# Main program: make sure that the binding information file exists,
# then process each corpus given on the command line, printing its
# name first.
init
# "$@" is quoted so that each argument is passed on as given, without
# word splitting or pathname expansion
for corpus in "$@"; do
    printf '%s\n' "$corpus"
    add_binding "$corpus"
done