-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2a_data_wrangling XLSForms.R
165 lines (113 loc) · 5.77 KB
/
2a_data_wrangling XLSForms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
##########################################################################################
## 2 Data wrangling (& Transformation) ---------------------------------------------------
## Using the dplyr package to modify Excel documents into XLS Form compatible documents
##########################################################################################
## Author: Abel Gelman
##########################################################################################
# A - Extract choices in XLSForm-compatible format:
# The partner shared a PDF with the questionnaire they wanted us to code
# A 1st step, not covered here, was to trasnform the PDF in Excel format
# The resulting Excel document had the questionnaire spread in 86 tabs. This is the document we import into R
# Load Packages -----------------------------------------------------------
library(dplyr)
library(readxl)
library(tidyverse)
# Set path to Excel file --------------------------------------------------
flocert_path <- "/Users/Abel/Documents/CartONG/R_webinar/R_training/flocert_1.xlsx"
# Import the 1st sheet of the Excel file (out of the 81)----------------------------------
x <- read_xlsx(flocert_path,
sheet = "Table 1",
skip = 4, # skip 1st 4 rows of the xlsx
col_types = "text", # force all variables to "character"
col_names = TRUE) # 1st row as column names
view(x) # Visualize how the document was imported
# Import the rest of the Excel document sheets & compile them in on single dataframe --------
for (i in 2:81){ # The Excel file has 81 sheets. Iterate from the second sheet to the last (note the 1st sheet is alredy imported )
tempo <- read_xlsx(flocert_path,
sheet = paste("Table", i), # changes the sheet number to import in every iteration
col_types = "text", # force all variables to "character"
col_names = TRUE)
x <- bind_rows(x, tempo) # add each page at the bottom of the previous one, making one only single work sheet
}
view(x) # Check how the document looks
colnames(x) # check column names
# Extract answers for the xlsform options tab
y <- x %>%
select(c(3, 5,6,7,8,9)) # selects only the useful columns: question number +
view(y)
colnames(y)
y <- y %>%
filter(`CC£No.` != "CC£No.", !is.na(`Rank£1`)) %>% #filter out rows that are not needed
pivot_longer(-`CC£No.`) %>% # pivot the table to have 1 row per answer (choice in xls forms)
filter(!is.na(value)) %>% # drop rows that don't have any answers
# change into new columns that are XLSform compatible
transmute(list_name = paste("s", gsub(".", "_" ,`CC£No.`, fixed = T), sep = ""), # create list names with the questions numbers
name = substr(name, nchar(name), nchar(name)), # only keep the rank numbers, not the word rank
label = value) # change the name of the column label
view(y)
# export as csv - document is ready to copy-paste into the xlsform
write.csv(y, "/Users/Abel/Documents/CartONG/R_webinar/R_training/flocert_2.csv")
##########################################################################################
## Complex XLS Form formulas
# extract criteria column -------------------------------------------------
# Within this project, the "criteria" groups questions across sections of the form
# We need to calculate the average rank (response values) for questions of criteria "D"
# Identify questions names and their criteria letter
# Starting from the original dataframe with the questions and answers, format the data:
yy <- x %>%
select(c(3,11)) %>% # Keep only the columns with the question number and criteria type
filter(`CC£No.` != "CC£No.") %>% # filter out rows with no values
rename(criteria_type = `Criteria£Type`,# rename columns
list_name = `CC£No.`) %>%
filter(!is.na(criteria_type )) %>% #filter out rows without criteria type
transmute(list_name = paste("s", gsub(".", "_", list_name, fixed = T), sep = ""),# change format of question number to match XLS Form
criteria_type = criteria_type)
write.csv(yy, "flocert_criteria.csv") #export output ready to copy paste into XLS Form
# make formula to calculate results averages for questions of criteria "D"
# create columns with XLSForm formulas for the numerator and denominator of the average formula
zz <- yy %>%
mutate(denominator = paste("if(${", list_name, "}!='',1,0)", sep = ''), # write XLS compatible formula
numerator = paste("if(${", list_name, "}!='', ${", list_name, "},0)", sep = ''),
section = substr(list_name, 2,2))
view(zz)
# Global development average
d_glb_denominator <- NULL #Creates an empty vector paste the elements of the denominator elements of the formula
for(l in 1:nrow(zz)){ # Loop over zz
if (zz[l,2] == 'D') # selects only the lines that have criteria "D"
{d_glb_denominator <- paste(d_glb_denominator, zz[l,3],' + ',sep = '') # adds to the vector each corresponfing formula
}
}
d_glb_denominator
d_glb_numerator <- NULL
for(l in 1:nrow(zz)){
if (zz[l,2] == 'D')
{d_glb_numerator <- paste(d_glb_numerator, zz[l,4],' + ',sep = '')
}
}
d_glb_numerator
#section 3
d_3_denominator <- NULL
for(l in 1:nrow(zz)){
if (zz[l,2] == 'D' & zz[l, 5] == "3")
{d_3_denominator <- paste(d_3_denominator, zz[l,3],' + ',sep = '')
}
}
d_3_numerator <- NULL
for(l in 1:nrow(zz)){
if (zz[l,2] == 'D' & zz[l, 5] == 3)
{d_3_numerator <- paste(d_3_numerator, zz[l,4],' + ',sep = '')
}
}
#section 4
d_4_denominator <- NULL
for(l in 1:nrow(zz)){
if (zz[l,2] == 'D' & zz[l, 5] == "3")
{d_4_denominator <- paste(d_4_denominator, zz[l,3],' + ',sep = '')
}
}
d_4_numerator <- NULL
for(l in 1:nrow(zz)){
if (zz[l,2] == 'D' & zz[l, 5] == 3)
{d_4_numerator <- paste(d_4_numerator, zz[l,4],' + ',sep = '')
}
}