-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmakefile
143 lines (110 loc) · 5.5 KB
/
makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
ROOT_DIR=/home/flinder/projects/text_reuse
DTA_DIR=$ROOT_DIR/data
ALIGN_DTA=$DTA_DIR/aligner_output
NCSL_DTA=$DTA_DIR/ncsl
IDEO_DTA=%DTA_DIR/ideology_analysis
TABLES=$ROOT_DIR/manuscript/tables
FIGURES=$ROOT_DIR/manuscript/figures
cd $ROOT_DIR
# data files: TODO: exact sources where available
# - extracted_bills_with_sponsors.json from DSSG / sunlight foundation
# - malp_individual.tab (from the malp authors' website) # - legislators.csv (legislators and ids from sunlight foundation matched with # malp ids (I can't find the original legislator data from sunlight anymore)
# Set up the database
cd etl/database
## Import bill data
_ : $DTA_DIR/initial_data/extracted_bills_with_sponsers.json
python import_bills.py
## Write metadata to csv file and write bill ids to separate txt file
$DTA_DIR/bill_metadata.csv $DTA_DIR/bill_ids.txt: $DTA_DIR/initial_data/extracted_bills_with_sponsers.json
python extract_bill_metadata.py
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Generate the alignments
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Generate the main alignment dataset
### Get the alignments (this takes about 3 weeks on 120 cores (requries HPC with pbs job scheduler)
$ALIGN_DTA/alignments.csv: $DTA_DIR/bill_ids.txt
# All these three can be run in parallel:
# Generates the actual alignments
python generate_alignments/run_hpc.py
# Collects the output files from each job and writes them to main output
# files
python generate_alignments/process_results.py $ALIGN_DTA/alignments \
$ALIGN_DTA/alignments.csv
python generate_alignments/process_results.py $ALIGN_DTA/bill_status \
$ALIGN_DTA/bill_status.csv
## Postprocess it (boilerplate weighting) and duplicate removal (yes that's a hack)
$ALIGN_DTA/alignments_notext.csv $ALIGN_DTA/alignments.csv: \
$ALIGN_DTA/alignments_with_dups.csv
python process_alignments.py
## Generate the ncsl alignments (full similarity matrix within parent topics
## for all matched bills)
### Steps to generate the ncsl dataset
$NCSL_DTA/checked_urls.csv:
python etl/ncsl_tables/get_table_urls.py
### Sample from them
$NCSL_DTA/sampled_urls.csv: $NCSL_DTA/checked_urls.csv
Rscript etl/ncsl_tables/sample_tables.R
### Sampled tables are then processed by hand and stored in
# $NCSL_DTA/ncsl_data_from_sample.csv
### Get the bills in the ncsl tables that are also in our database
$$NCSL_DTA/ncsl_data_from_sample_matched.csv NCSL_DTA/matched_ncsl_bill_ids.txt: \
$NCSL_DTA/ncsl_data_from_sample.csv $DTA_DIR/bill_metadata.csv \
$NCSL_DTA/states.csv
Rscript etl/ncsl_tables/match_bills.R
### Generate the alignments
$ALIGN_DTA/ncsl_alignments: $NCSL_DTA/matched_ncsl_bill_ids.txt
python generate_alignments/ncsl_bill_pairs.py
### Postprocess (boilerplate weighting)
$ALIGN_DTA/ncls_alignments_notext.csv: $ALIGN_DTA/ncls_alignments.csv
python generate_alignments/process_ncls_alignments.py
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Descriptives
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Bill database overview (how many bills per state)
$FIGURES/year_count_by_state.png: data/bill_ids.txt data/bill_metadata.csv
Rscript analysis/exploratory/bill_meta_exp.R
## Exploration of alignment data
$FIGURES/alignment_score_distribution.png $TABLES/alignments_descriptives.yml: \
$ALIGN_DTA/alignments_notext.csv
Rscript analysis/exploratory/alignment_exploration.R
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Analysis
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## NCSL
### Generate the cosine similarity scores
### Requires elastic search connection
$NCSL_DTA/cosine_similarities.csv: $NCSL_DTA/matched_ncsl_bill_ids.txt
python etl/ncsl_cosim/ncsl_cosim.py
### Figures and tables
$FIGURES/ncsl_pr_cosm.png $FIGURES/ncsl_pr_nosplit.png \
$TABLES/ncsl_auc.tex $TABLES/ncsl_bs_reg.tex $TABLES/prec_rec_distri.txt: \
$ALIGN_DTA/ncsl_alignments_notext.csv \
$NCSL_DTA/ncsl_data_from_sample_matched.csv \
$NCSL_DTA/cosine_similarities.csv
Rscript analysis/ncsl/ncsl_analysis.R
## Ideology
### Prep dataset
$DTA_DIR/ideology_analysis/ideology_analysis_input.RData: $ALIGN_DTA/alignments_notext.csv \
$DTA_DIR/bill_metadata.csv
Rscript analysis/ideology/make_ideology_dataset.R
### Run the bootstrapped regressions (requries HPC with pbs job scheduler)
$IDEO_DIR/regression_results.RData: $IDEO_DIR/ideology_analysis_input.RData
python analysis/ideology/gen_bs_jobs.py base # Submits base model job
python analysis/ideology/gen_bs_jobs.py all # Submits bootstrap jobs
Rscript analysis/ideology/ideology_regression.R # Collects outputs
## Diffusion analysis
$TABLES/diffusion_regression_results.tex: $ALIGN_DTA/alignments_notext.csv \
$DTA_DIR/bill_metadata.csv $DTA_DIR/dhb2015apsr-networks.csv
Rscript analysis/diffusion/diffusion_analysis.R
## Parisanship application
$TABLES/partisanship_dyad_distribution.tex $FIGURES/partisanship_score_distribution.png: $DTA_DIR/ideology_analysis/ideology_analysis_input.RData
Rscript analysis/partisanship/partisanship.R
## Additional stuff for new faces presentation
## Most common alignments
all_alignments.p: $ALIGN_DTA/alignments.csv
python analysis/exploratory/common_alignments.py
## Proportion aligned distribution
## in alignment_exploration.R
## import alignments to sql database
_ : $ALIGN_DTA/alignments_notext.csv $ALIGN_DTA/alignments.csv:
psql -U flinder -d text_reuse -f etl/sql/import_alignments.sql