From 437ce7d932ca235f1c7f919d4311d8501f657d0d Mon Sep 17 00:00:00 2001
From: sapolikanov <91873604+sapolikanov@users.noreply.github.com>
Date: Mon, 26 Aug 2024 09:09:29 -0400
Subject: [PATCH] Add comments to two new .qmd (#15)

---
 scripts/data_building.qmd | 41 ++++++++++++++++++++++++++++++++++++++-
 scripts/raw_data_prep.qmd |  6 +++++-
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/scripts/data_building.qmd b/scripts/data_building.qmd
index 8098507..4fa33fb 100644
--- a/scripts/data_building.qmd
+++ b/scripts/data_building.qmd
@@ -21,6 +21,12 @@ source(here::here("utilities", "check_packages.R"))
 conflicts_prefer(dplyr::filter)
 ```
 
+In this script, data is imported from a variety of sources and processed to be used in the analysis. The data is stored in the `data/data_built` folder.
+
+I refer the reader to the main `paper/paper.qmd` file for the description of data sources. I provide minimal comments here.
+
+We start by loading the data.
+
 ```{r}
 #| label: load-data
 
@@ -103,6 +109,8 @@ conflicts_prefer(dplyr::filter)
                                      sheet = 3)
 ```
 
+Aggregate exit poll data is homogenized to be compatible with raw data from `raw_data_prep.qmd` and to have better-formatted country and city (voting station) names.
+
 ```{r}
 #| label: clean-exit-poll
 
@@ -132,6 +140,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+Raw data is aggregated to voting-station level. 
+
 ```{r}
 #| label: clean-exit-poll-raw
   
@@ -190,11 +200,14 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+This dictionary is needed to convert between country names as provided by the Russian Ministry of Foreign
+Affairs and those in the international code dictionary.
+
 ```{r}
 #| label: clean-uik-dictionary
 
   ## Convert countrynames in Russian provided by the Russian Ministry of Foreign  
-  ## Affairs to match with those int the international code dictionary
+  ## Affairs to match with those in the international code dictionary
   uik_dict_clean <- uik_dict |> 
     mutate(uik = as.character(uik), # For merges
            country_compatible = case_when(
@@ -246,6 +259,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+Official results are modified to be compatible with exit poll data. Variable names are translated and the results are expressed as shares rather than percentages.
+
 ```{r}
 #| label: clean-off-results
   
@@ -319,6 +334,8 @@ conflicts_prefer(dplyr::filter)
 
 ```
 
+Yandex data is not in fact used, but I retain this section for potential future use. The data is an alternative to the migration measures used in the paper.
+
 ```{r}
 #| label: clean-yandex
 
@@ -338,6 +355,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+Bilateral migration data is subset to years of interest, destination and origin.
+
 ```{r}
 #| label: clean-bilat-migration
   
@@ -352,6 +371,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+The same is done for international migrant stock data.
+
 ```{r}
 #| label: clean-int-migrant-stock
 
@@ -371,6 +392,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+I translate FSB country names to international country codes by hand due to idiosyncrasies in the data.
+
 ```{r}
 #| label: clean-fsb-migration
 
@@ -552,6 +575,8 @@ conflicts_prefer(dplyr::filter)
 # No Burundi, Central African Republic, Paraguay, Guinea-Bissau, North Macedonia
 ```
 
+Simple pre-processing (subsetting, countrycodes) is applied to religion data.
+
 ```{r}
 #| label: clean-religion
   
@@ -571,6 +596,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+I select variables of interest from the QoG dataset and homogenize countrycodes.
+
 ```{r}
 #| label: clean-qog
   
@@ -592,6 +619,8 @@ conflicts_prefer(dplyr::filter)
   
 ```
 
+I select variables of interest from the ATOP dataset and homogenize countrycodes.
+
 ```{r}
 #| label: clean-atop
 
@@ -619,6 +648,8 @@ conflicts_prefer(dplyr::filter)
                                 labels = c("0", "1", "2", "3", "4")))
 ```
 
+I rename and homogenize countrycodes in trade data.
+
 ```{r}
 #| label: clean-trade
 
@@ -643,6 +674,8 @@ conflicts_prefer(dplyr::filter)
     drop_na(countrycode_n, countrycode_c)
 ```
 
+The same is done with the geodesic distance data.
+
 ```{r}
 #| label: clean-geodist
 
@@ -657,6 +690,8 @@ conflicts_prefer(dplyr::filter)
     select(countrycode_n, countrycode_c, dist, distcap, distw, distwces)
 ```
 
+I then merge all datasets sequentially (for ease of troubleshooting).
+
 ```{r}
 #| label: merge data
   
@@ -684,6 +719,8 @@ conflicts_prefer(dplyr::filter)
            haritonov_full = if_else(countrycode_c == "TUR", haritonov_cec, haritonov_full))
 ```
 
+Since the output data is at the voting-station level, I aggregate it to the country level in the next step. For some variables this means summing the values, for others taking the average, and for others still, retaining the constant value.
+
 ```{r}
 #| label: aggregate to country-level
 
@@ -735,6 +772,8 @@ conflicts_prefer(dplyr::filter)
            spoiled_dist = spoiled_full - spoiled_ep)
 ```
 
+Lastly I export all resulting datasets.
+
 ```{r}
 #| label: save-data
   
diff --git a/scripts/raw_data_prep.qmd b/scripts/raw_data_prep.qmd
index 6b495ef..8349e84 100644
--- a/scripts/raw_data_prep.qmd
+++ b/scripts/raw_data_prep.qmd
@@ -20,7 +20,7 @@ source(here::here("utilities", "check_packages.R"))
 ```
 
 ```{r}
-#| label: load-cdata
+#| label: load-data
 
 # Load exit poll raw data
 ep_raw <- read_excel(here("data", "data_raw", "exitpoll_individual", 
@@ -28,6 +28,8 @@ ep_raw <- read_excel(here("data", "data_raw", "exitpoll_individual",
                      sheet = 4, guess_max = 69262)
 ```
 
+This short script achieves two main goals: translating Russian-language inputs and variable names into English, and converting country names to a universal country-code format for later merging.
+
 ```{r}
 #| label: data-prep
 
@@ -134,6 +136,8 @@ ep_raw_clean <- ep_raw |>
             id = row_number())
 ```
 
+When this is done, I write data to a `data_built/` folder. 
+
 ```{r}
 #| label: save-ep-raw-data