ECDC.Rmd

---
title: "ECDC Vaccination data"
author: "FD"
output: 
  html_document: 
      code_folding: hide
      toc: TRUE
      toc_float: TRUE
      self_contained: no
editor_options:
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
rm(list = ls())
```

```{r, include = FALSE}
dlData <- FALSE
# Whether to download the data again

today <- Sys.Date()
```

# Initializations

## Graphical parameters

```{r}
# Colors
colPop <- gray(0.8) # Unvaccinated
colComplet1 <- "#9B2500"
col1D1 <- "#FF6939"
colComplet2 <- "#044063"
col1D2 <- "#4F92BA"
```

## Load data and clean data

```{r}
# Load data 
# Soure : https://www.ecdc.europa.eu/en/publications-data/data-covid-19-vaccination-eu-eea
URL <- "https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv"


dataFile <- paste0("data/edcdc-vaccination_", today, ".csv")
if(dlData){
  download.file(URL, dataFile)
}
dat.ECDC <- read.csv(dataFile, stringsAsFactors = FALSE)
head(dat.ECDC)


# Remove regional data, keep only country data
dat.ECDC <- dat.ECDC[is.element(dat.ECDC$Region, dat.ECDC$ReportingCountry), ]


# Load country codes
# Source: https://gist.github.com/radcliff/f09c0f88344a7fcef373
# Modified for Greece (was GR, changed into EL)
cc <- read.csv("data/countryCodes.csv")
# Turn into dictionary
dic.cc <- cc$English.short.name.lower.case
names(dic.cc) <- cc$Alpha.2.code

# Add full name of the countries
dat.ECDC$Country <- dic.cc[dat.ECDC$ReportingCountry]

# Clean memory
rm(dic.cc, cc)

# Check age groups for all countries
table(dat.ECDC$Country, dat.ECDC$TargetGroup)
# Germany, The Netherlands and Liechtenstein do not share age data

# Define age classes
ac1 <- data.frame(ageClass = c("Age0_4", "Age5_9", "Age10_14", "Age15_17", "Age18_24", "Age25_49", "Age50_59", "Age60_69", "Age70_79", "Age80+"))

ac2 <- data.frame(ageClass = c("Age<18", "Age18_24", "Age25_49", "Age50_59", "Age60_69", "Age70_79", "Age80+"))

ac1$minAge <- c(0, 5, 10, 15, 18, 25, 50, 60, 70, 80)
ac1$maxAge <- c(4, 9, 14, 17, 24, 49, 59, 69, 79, 100)
ac1$ageWidth <- ac1$maxAge - ac1$minAge + 1

ac2$minAge <- c(0, 18, 25, 50, 60, 70, 80)
ac2$maxAge <- c(17, 24, 49, 59, 69, 79, 100)
ac2$ageWidth <- ac2$maxAge - ac2$minAge + 1


# Initialize the new dataset
newdat <- data.frame("TargetGroup" = character(0), 
                     "FirstDose" = numeric(0), 
                     "SecondDose" = numeric(0), 
                     "Population" = numeric(0), 
                     "Denominator" = numeric(0), 
                     "Country" = character(0), 
                     "ReportingCountry" = character(0), 
                     "minAge" = numeric(0), 
                     "maxAge" = numeric(0),
                     "ageWidth" = numeric(0), 
                     "YearWeekISO" = character(0))

#thedate <- max(dat.ECDC$YearWeekISO)
#thedate
#c1 <- "ES"

for(thedate in sort(unique(dat.ECDC$YearWeekISO))){
  for(ctr in unique(dat.ECDC$ReportingCountry)){

  tmp <- dat.ECDC[dat.ECDC$ReportingCountry == ctr & dat.ECDC$YearWeekISO <= thedate, ]
  
  if(nrow(tmp) > 1){
    # If there are data at this date!
    
      # Compute cumulated values
  agg <- aggregate(x = tmp[, c("FirstDose", "SecondDose")], by = list(TargetGroup = tmp[, "TargetGroup"]), FUN = sum)
  
  agg2 <- aggregate(x = tmp[, c("Population", "Denominator", "Country", "ReportingCountry")], by = list(TargetGroup = tmp[, "TargetGroup"]), FUN = unique)
  
  agg <- merge(agg, agg2, by = "TargetGroup")
  
  # If there is no line for Age<18, add it
  if(all(agg$TargetGroup != "Age<18")){
    agg <- rbind(agg, c("Age<18", 0, 0, agg[agg$TargetGroup == "ALL", "Population"], NA, agg[agg$TargetGroup == "ALL", "Country"], agg[agg$TargetGroup == "ALL", "ReportingCountry"]))
  }
  
  # Add Denominator to Age<18 if it is missing
  if(is.na(agg[agg$TargetGroup == "Age<18", "Denominator"])){
    agg[agg$TargetGroup == "Age<18", "Denominator"] <- as.numeric(agg[agg$TargetGroup == "ALL", "Population"]) - as.numeric(agg[agg$TargetGroup == "ALL", "Denominator"])
  }
  
  # Subselect the age class data that we need
  # Detailed age classes for children
  ac.children <- setdiff(ac1$ageClass, ac2$ageClass)
  # Subset of the data for these age classes
  tagg <- agg[is.element(agg$TargetGroup, ac.children), ]
  # NOTE: consider using "any" instead of "all" here
  if(all(is.na(tagg$Denominator))){
    # If we are missing all the population sizes, use the other age classes (ac2)
    agg <- agg[is.element(agg$TargetGroup, ac2$ageClass), ]
    # Add age class information
    agg <- merge(agg, ac2, by.x = "TargetGroup", by.y = "ageClass", all.x = TRUE)
  }else{
    # But if we have some values, use them
    agg <- agg[is.element(agg$TargetGroup, ac1$ageClass), ]
    # Add age class information
    agg <- merge(agg, ac1, by.x = "TargetGroup", by.y = "ageClass", all.x = TRUE)
  }
  agg$YearWeekISO <- thedate
  
  # Add to the new dataset
  newdat <- rbind(newdat, agg)
  }

  }
}

# Made numerical values numeric again
for(col in c("FirstDose", "SecondDose", "Population", "Denominator", "minAge", "maxAge")){
  newdat[, col] <- as.numeric(newdat[, col])
}

unique(newdat$YearWeekISO)

# Select a late date 
# (not necessarily the final one, because may be incomplete)
newdat.final <- newdat[newdat$YearWeekISO == "2021-W32", ]

# Count lines per country
tb <- table(newdat.final$Country)
# Countries without age data
missingAge <- names(tb[tb == 1])
missingAge

# Remove them from the final dataset
newdat.final <- newdat.final[!is.element(newdat.final$Country, missingAge), ]

newdat.final
```


# Plot age pyramid

```{r}
# 2-letter country codes of the two countries to plot
c1 <- "FI"
c2 <- "IE"

byRec <- 10000 # Population size for the graduations

sameScale <- TRUE # Whether to plot the two countries on the same scale or not

# Which version of the plot:
# 1: unvaxxed outside
# 2: unvaxxed inside
version <- 2

# Subset of the data with these countries
tmp <- newdat.final[is.element(newdat.final$ReportingCountry, c(c1, c2)), ]

# Get max size of age class
tmp1 <- tmp[tmp$ReportingCountry == c1, ]
tmp2 <- tmp[tmp$ReportingCountry == c2, ]
xmax1 <- max(tmp1$Denominator / tmp1$ageWidth)
xmax2 <- max(tmp2$Denominator / tmp2$ageWidth)

xmax <- max(c(xmax1, xmax2))

# Rescale population sizes
if(sameScale){
  tmp1$RDenom <- tmp1$Denominator / xmax / tmp1$ageWidth
  tmp1$R1D <- tmp1$FirstDose / xmax / tmp1$ageWidth
  tmp1$R2D <- tmp1$SecondDose / xmax / tmp1$ageWidth
  
  tmp2$RDenom <- tmp2$Denominator / xmax / tmp2$ageWidth
  tmp2$R1D <- tmp2$FirstDose / xmax / tmp2$ageWidth
  tmp2$R2D <- tmp2$SecondDose / xmax / tmp2$ageWidth
}else{
  tmp1$RDenom <- tmp1$Denominator / xmax1 / tmp1$ageWidth
  tmp1$R1D <- tmp1$FirstDose / xmax1 / tmp1$ageWidth
  tmp1$R2D <- tmp1$SecondDose / xmax1 / tmp1$ageWidth
  
  tmp2$RDenom <- tmp2$Denominator / xmax2 / tmp2$ageWidth
  tmp2$R1D <- tmp2$FirstDose / xmax2 / tmp2$ageWidth
  tmp2$R2D <- tmp2$SecondDose / xmax2 / tmp2$ageWidth
}

# Name of the output file
fname <- paste0("pics/pyramid_", c1, "-", c2, "_", thedate, "_v", version, "_sameScale-", 1*sameScale, ".pdf")
    
# Open pdf
pdf(fname, width = 7.5, height = 7)
    
par(xpd = FALSE, family = "sans", mgp = c(2, 0.15, 0), tck = -0.02)
par(mar = c(6, 2.5, 4.5, 2.5))

# Initialize plot
plot(c(-1, 1), c(0, 100), type = "n", 
              axes = FALSE, xlab = "", ylab = "", 
         xaxs = "i")
    
    # Write Credits
    par(family = "mono")
    mtext(side = 1, line = 4.5, text = paste0("@flodebarre, adapted from @VictimOfMaths, ", today, " 
Data ECDC: https://opendata.ecdc.europa.eu/covid19/vaccine_tracker/csv/data.csv
Code: https://github.com/flodebarre/covid_vaccination/blob/main/ECDC.Rmd"), adj = 0, cex = 0.55, col = gray(0.5))
    par(family = "sans")
    
    for(ictr in c(1, 2)){
      
      ctr <- c(c1, c2)[ictr]

      # For each country / side of the plot
      factor <- (-1)^ictr
      colComplet <- get(paste0("colComplet", ictr))
      col1D <- get(paste0("col1D", ictr))
      
      tmpp <- get(paste0("tmp", ictr))
      fullCtr <- unique(tmpp$Country) # Full name
      
      for(ag in unique(tmpp$TargetGroup)){
        # For each age class, 
        # Subset of the data
        tmp <- tmpp[tmpp$TargetGroup == ag, ]

        # Plot Total population
        rect(xleft = factor * tmp$RDenom, ybottom = tmp$minAge, 
             xright = 0, ytop = tmp$maxAge + 1, 
             col = colPop, border = gray(0, 0))
        
        if(version == 1){
          # Vaccinated, 1 dose
          rect(xleft = factor * min(tmp$R1D, tmp$RDenom), ybottom = tmp$minAge, 
               xright = 0, ytop = tmp$maxAge + 1, 
               col = col1D, border = gray(0, 0))
          
          # Vaccinated, complete
          rect(xleft = factor * min(tmp$R2D, tmp$RDenom), ybottom = tmp$minAge, 
               xright = 0, ytop = tmp$maxAge + 1, 
               col = colComplet, border = gray(0, 0))
        }else{
          ## Full vaccination in the end
          # Vaccinated, 1 dose
          rect(xleft = factor * max(0, tmp$RDenom - tmp$R1D), ybottom = tmp$minAge, 
               xright = factor * tmp$RDenom, ytop = tmp$maxAge + 1, 
               col = col1D, border = gray(0, 0))
          
          # Vaccinated, complete
          rect(xleft = factor * max(0, tmp$RDenom - tmp$R2D), ybottom = tmp$minAge, 
               xright = factor * tmp$RDenom, ytop = tmp$maxAge + 1, 
               col = colComplet, border = gray(0, 0))
          
        }
        
        # Graduations for age class
        lines(c(factor, 0), rep(tmp$minAge, 2), col = "white", lwd = 1.5)
        # Age values
        par(xpd = TRUE)
        if(factor == -1){adjj <- 1}else{adjj <- 0}
        text(x = factor, y = tmp$minAge, labels = tmp$minAge, col = gray(0), adj = c(adjj, 0.25), cex = 0.9)
        par(xpd = FALSE)
        
      }
      # Add country legend
      par(xpd = TRUE)
      if(factor == -1){adjj <- 0}else{adjj <- 1}
      text(x = factor, y = 110, labels = fullCtr, adj = c(adjj, 0), cex = 1.3, font = 2)
      par(xpd = FALSE)
      
    }
    
    cexl <- 0.9 # Text size of legend of axes
    mtext(paste0("Population by year of age
(One rectangle: ", format(byRec, scientific = FALSE)," individuals)"), side = 1, line = 1.25, cex = cexl)
    
    par(xpd = FALSE)
    # Graduations
    wfine <- 0.3
    # Horizontal by year
    for(i in 0:110){
      lines(c(-1, 1), rep(i, 2), col = "white", lwd = wfine)
    }
    # Vertical by byRec (10000 for instance)
    for(i in seq(0, 2*10^6, by = byRec)){
      if(sameScale){
        abline(v = i/xmax, col = "white", lwd = wfine)
        abline(v = -i/xmax, col = "white", lwd = wfine)
      }else{
        abline(v = i/xmax2, col = "white", lwd = wfine)
        abline(v = -i/xmax1, col = "white", lwd = wfine)
      }
    }
    # Vertical separation of the two countries
    #abline(v = 0, col = gray(0), lwd = 1.5)
    lines(x = c(0, 0), y = c(0, max(tmpp$maxAge) + 1), col = 1, lwd = 1.5, lend = "butt")
    
    # Legend ages
    par(xpd = TRUE)
    
    yl <- 100   # y position of "Age" 
    text("Age", x = 1, y = yl, cex = cexl, adj = c(0.5, 1))
    text("Age", x = -1, y = yl, cex = cexl, adj = c(0.5, 1))
    
    
    # Plot legend (manually centered)
    par(family = "mono")
    legend(x = 0, y = 105, pch = 15, col = c(colComplet1, col1D1, colPop, colComplet2, col1D2, colPop), ncol = 2, legend = c("     2 doses", "     1 dose", " non vaccinated", "", "", ""), xjust = 0.29, yjust = 0, box.lwd = -1, text.font = 1, pt.cex = 2, cex = 0.8)
    
    # Add title 
    par(family = "sans")
    mtext(thedate, side = 3, line = 3, cex = 1.2, font = 2)

    dev.off()
    
    system(paste0("open ", fname))

```