read.sh
#!/bin/bash
elasticsearchUrl='http://localhost:9200'
index="fiscal_v1"
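# Requires curl, wget, unzip, bc, and GNU xargs (for -d '\n'); assumes an
# Elasticsearch node is reachable at the URL above.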
# Add index template
echo "Adding index template (fiscal_template.json): "
curl -XPUT "${elasticsearchUrl}/_template/fiscal_template" -H "Content-Type: application/json" -d @fiscal_template.json
echo
echo
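# fiscal_template.json is not reproduced here; a legacy _template body
# generally has this shape (hypothetical sketch, not the real file):
#   {
#     "index_patterns": ["fiscal_*"],
#     "settings": { ... },
#     "mappings": { ... }
#   }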
# Add pipeline for ingesting csv data
echo "Adding pipeline for ingesting csv data (parse_fiscal_pipeline.json): "
curl -XPUT "${elasticsearchUrl}/_ingest/pipeline/parse_fiscal" -H "Content-Type: application/json" -d @parse_fiscal_pipeline.json
echo
echo
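# parse_fiscal_pipeline.json is not reproduced here; an ingest pipeline body
# takes the form {"description": ..., "processors": [...]} and, given the
# bulk format built below, presumably splits the raw "budget" field with
# something like the csv or grok processor (hypothetical sketch):
#   {"description": "parse fiscal csv rows",
#    "processors": [{"csv": {"field": "budget", "target_fields": ["..."]}}]}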
# Prompt to delete if index exists
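# HEAD the index URL and capture just the HTTP status: -s silences progress
# output, -o /dev/null discards the body, -I sends a HEAD request, and
# -w "%{http_code}" prints the response code (404 means the index is absent)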
status=$(curl -s -o /dev/null -I -w "%{http_code}" "${elasticsearchUrl}/${index}")
if [[ $status != 404 ]]; then
  read -p "The index ${index} exists, do you want to delete and re-import the data? " -n 1 -r
  echo # move to a new line after the single-character reply
  if [[ $REPLY =~ ^[Yy]$ ]]; then
    echo "Deleting index ${index}"
    curl -XDELETE "${elasticsearchUrl}/${index}"
    echo
    echo
  else
    echo "Did not answer yes, exiting."
    exit 1
  fi
fi
# Add the index
echo "Adding index ${index}"
curl -XPUT "${elasticsearchUrl}/${index}"
echo
echo
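# A PUT with no body creates the index with default settings; assuming the
# template's index pattern matches "fiscal_v1", its settings and mappings
# are applied automatically at creation time.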
dataFile="data.zip"
# Grab the file if it doesn't exist - change this link if the data updates
if [ ! -f "$dataFile" ]; then
  echo "Downloading data file ${dataFile}"
  wget -O "$dataFile" "https://file.ac/download-all/21euCO1oZvDIdainl9w56Q/"
  unzip "$dataFile" -d data
  echo
fi
# Turn csv into bulk request format
echo "Processing csv into bulk request format - this may take a few minutes"
if [ ! -f requests.jsonl ]; then
  for f in data/*
  do
    # Skip the csv header row, strip trailing carriage returns, escape
    # embedded double quotes, then wrap each row in a bulk action/source
    # pair (note both braces must close on the action line)
    tail -n +2 "$f" | sed -e "$(printf 's/\r$//g')" -e 's/"/\\"/g' | xargs -d '\n' printf '{"index":{"_index":"'"$index"'","pipeline":"parse_fiscal"}}\n{"budget":"%s"}\n' >> requests.jsonl
  done
fi
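# Each csv row becomes an action/source pair in requests.jsonl, e.g.
# (illustrative values, not real data):
#   {"index":{"_index":"fiscal_v1","pipeline":"parse_fiscal"}}
#   {"budget":"2017,General Fund,1234.56"}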
# Clean previous
rm -rf split
mkdir split
# Split file into chunks to bulk index
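# 5000 lines per chunk is 2500 documents per bulk request, since each
# document occupies two lines (action line + source line)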
split -l 5000 -a 4 requests.jsonl split/
# Submit file as a bulk indexing request to elastic using the parse_fiscal pipeline
echo "Indexing data file"
total=$(ls split | wc -l | bc)
i=0
echo -ne "[#          ] 0/${total} (0%)\r"
for f in split/*
do
  i=$(($i+1))
  # The bulk API requires the request body to end with a newline, so add
  # one if the last line of the chunk is missing it
  last=$(tail -n 1 "$f")
  if [[ -n "${last}" ]]; then
    echo "" >> "${f}"
  fi
  # The modulus (default 2) bounds roughly how many bulk requests are in
  # flight at once: every request whose counter divides evenly runs in the
  # foreground, throttling the backgrounded ones. Raise it for higher
  # import throughput, but setting it too high can cause garbage collection
  # pressure and/or crash Elasticsearch. Do not set it higher than the
  # number of threads your machine has.
  mod=$((${i}%2))
  if [[ $mod == 0 ]]; then
    curl -s -XPOST "${elasticsearchUrl}/_bulk" -H "Content-Type: application/x-ndjson" --data-binary "@${f}" > /dev/null
  else
    curl -s -XPOST "${elasticsearchUrl}/_bulk" -H "Content-Type: application/x-ndjson" --data-binary "@${f}" > /dev/null &
  fi
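  # Render an 11-character progress bar; bc handles the fractional math
  # because bash arithmetic is integer-only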
  progress='#'
  ticks=$(printf "%.0f" $(bc <<< "scale=1; ${i}/${total}*10"))
  for ((j=0;j<10;j++)); do
    if (( $ticks > $j )); then progress="${progress}#"; else progress="${progress} "; fi
  done
  percent=$(printf "%.0f" $(bc <<< "scale=2; ${i}/${total}*100"))
  echo -ne "[${progress}] ${i}/${total} (${percent}%)\r"
done
# Wait for any backgrounded bulk requests to finish before cleaning up
wait
rm -rf split
rm requests.jsonl
echo
echo "Import complete"