From 437ce7d932ca235f1c7f919d4311d8501f657d0d Mon Sep 17 00:00:00 2001 From: sapolikanov <91873604+sapolikanov@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:09:29 -0400 Subject: [PATCH] Add comments to two new .qmd (#15) --- scripts/data_building.qmd | 41 ++++++++++++++++++++++++++++++++++++++- scripts/raw_data_prep.qmd | 6 +++++- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/scripts/data_building.qmd b/scripts/data_building.qmd index 8098507..4fa33fb 100644 --- a/scripts/data_building.qmd +++ b/scripts/data_building.qmd @@ -21,6 +21,12 @@ source(here::here("utilities", "check_packages.R")) conflicts_prefer(dplyr::filter) ``` +In this script, data is imported from a variety of sources and processed to be used in the analysis. The data is stored in the `data/data_built` folder. + +I refer the reader to the main `paper/paper.qmd` file for the description of data sources. I provide minimal comments here. + +We start by loading the data. + ```{r} #| label: load-data @@ -103,6 +109,8 @@ conflicts_prefer(dplyr::filter) sheet = 3) ``` +Aggregate exit poll data is homogenized to be compatible with raw data from `raw_data_prep.qmd` and to have better-formatted country- and city- (voting stations) names. + ```{r} #| label: clean-exit-poll @@ -132,6 +140,8 @@ conflicts_prefer(dplyr::filter) ``` +Raw data is aggregated to voting-station level. + ```{r} #| label: clean-exit-poll-raw @@ -190,11 +200,14 @@ conflicts_prefer(dplyr::filter) ``` +This dictionary is needed for conversion between country names as provided by the Russian Ministry of Foreign +Affairs and those in the international code dictionary. 
+ ```{r} #| label: clean-uik-dictionary ## Convert countrynames in Russian provided by the Russian Ministry of Foreign - ## Affairs to match with those int the international code dictionary + ## Affairs to match with those in the international code dictionary uik_dict_clean <- uik_dict |> mutate(uik = as.character(uik), # For merges country_compatible = case_when( @@ -246,6 +259,8 @@ conflicts_prefer(dplyr::filter) ``` +Official results are modified to be compatible with exit poll data. Variable names are translated and the results are presented in shares and not percent. + ```{r} #| label: clean-off-results @@ -319,6 +334,8 @@ conflicts_prefer(dplyr::filter) ``` +Yandex data is not in fact used, but I retain this section for future potential. The data is an alternative to migration measures used in the paper. + ```{r} #| label: clean-yandex @@ -338,6 +355,8 @@ conflicts_prefer(dplyr::filter) ``` +Bilateral migration data is subset to years of interest, destination and origin. + ```{r} #| label: clean-bilat-migration @@ -352,6 +371,8 @@ conflicts_prefer(dplyr::filter) ``` +The same is done for international migrant stock data. + ```{r} #| label: clean-int-migrant-stock @@ -371,6 +392,8 @@ conflicts_prefer(dplyr::filter) ``` +I translate FSB country names to international country codes by hand due to data idiosyncrasy. + ```{r} #| label: clean-fsb-migration @@ -552,6 +575,8 @@ conflicts_prefer(dplyr::filter) # No Burundi, Central African Republic, Paraguay, Guinea-Bissau, North Macedonia ``` +Simple pre-processing (subsetting, countrycodes) is applied to religion data. + ```{r} #| label: clean-religion @@ -571,6 +596,8 @@ conflicts_prefer(dplyr::filter) ``` +I select variables of interest from the QoG dataset and homogenize countrycodes. + ```{r} #| label: clean-qog @@ -592,6 +619,8 @@ conflicts_prefer(dplyr::filter) ``` +I select variables of interest from the ATOP dataset and homogenize countrycodes. 
+ ```{r} #| label: clean-atop @@ -619,6 +648,8 @@ conflicts_prefer(dplyr::filter) labels = c("0", "1", "2", "3", "4"))) ``` +I rename and homogenize countrycodes in trade data. + ```{r} #| label: clean-trade @@ -643,6 +674,8 @@ conflicts_prefer(dplyr::filter) drop_na(countrycode_n, countrycode_c) ``` +The same is done with the geodesic distance data. + ```{r} #| label: clean-geodist @@ -657,6 +690,8 @@ conflicts_prefer(dplyr::filter) select(countrycode_n, countrycode_c, dist, distcap, distw, distwces) ``` +I then merge all datasets sequentially (for ease of troubleshooting). + ```{r} #| label: merge data @@ -684,6 +719,8 @@ conflicts_prefer(dplyr::filter) haritonov_full = if_else(countrycode_c == "TUR", haritonov_cec, haritonov_full)) ``` +Since output data is at voting-station level, I aggregate to country-level in the next step. For some variables this means summing up the values, for others - taking the average and for some, retaining the constant. + ```{r} #| label: aggregate to country-level @@ -735,6 +772,8 @@ conflicts_prefer(dplyr::filter) spoiled_dist = spoiled_full - spoiled_ep) ``` +Lastly I export all resulting datasets. + ```{r} #| label: save-data diff --git a/scripts/raw_data_prep.qmd b/scripts/raw_data_prep.qmd index 6b495ef..8349e84 100644 --- a/scripts/raw_data_prep.qmd +++ b/scripts/raw_data_prep.qmd @@ -20,7 +20,7 @@ source(here::here("utilities", "check_packages.R")) ``` ```{r} -#| label: load-cdata +#| label: load-data # Load exit poll raw data ep_raw <- read_excel(here("data", "data_raw", "exitpoll_individual", @@ -28,6 +28,8 @@ ep_raw <- read_excel(here("data", "data_raw", "exitpoll_individual", sheet = 4, guess_max = 69262) ``` +This short script by and large achieves two goals - translating input and variable names in Russian into English and cross-referencing country names into universal country code format for future merging. 
+ ```{r} #| label: data-prep @@ -134,6 +136,8 @@ ep_raw_clean <- ep_raw |> id = row_number()) ``` +When this is done, I write data to a `data_built/` folder. + ```{r} #| label: save-ep-raw-data