Generate_Figure3_Visuals.Rmd

---
title: "Visuals for pyspi paper figure 3"
output: html_document
date: "2023-08-04"
---

```{r setup, include=FALSE}
# Chunk defaults for the whole document: show the code, hide messages and warnings
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE
)
```


# Visuals for pyspi paper figure 3

## Configure system

Load necessary packages:
```{r}
library(tidyverse)
library(feather)
library(glue)
library(reticulate)
library(cowplot)
theme_set(theme_cowplot())
require(plyr)
```

Define the Python installation to use and import the `pyarrow.feather` module (used below to read `.feather` files):

```{r}
# Path to the Python interpreter that reticulate should bind to
python_to_use <- "/path/to/your/preferred/installation/of/python3"
reticulate::use_python(python_to_use)

# Expose the Python module pyarrow.feather in R under the name pyarrow_feather
pyarrow_feather <- reticulate::import("pyarrow.feather")
```

## Load and prepare data

Load SPI information and colours:
```{r}
# Per-SPI information table
SPI_info <- read.csv("data/SPI_info.csv")

# Module colour lookup table; the X column is dropped
# (presumably a row-index column written alongside the data -- confirm)
SPI_module_colours <- dplyr::select(
  read.csv("data/SPI_module_colours.csv"),
  -X
)

# Show the colour lookup as a styled table in the knitted output
kableExtra::kable_styling(knitr::kable(SPI_module_colours))
```

Load the SPI classification performance data per classification problem:
```{r}
# BasicMotions
# Change this to wherever you store your data for this repo
BasicMotions_data_path <- "data/BasicMotions"

# Raw inputs: sample metadata, time series, and pyspi features (joined to metadata by Sample_ID)
BasicMotions_metadata <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/BasicMotions_sample_metadata.feather")
)
BasicMotions_TS_data <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_TS.feather")
)
BasicMotions_pyspi_data <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_pyspi_filtered_for_classification.feather")
) %>%
  left_join(BasicMotions_metadata, by = "Sample_ID")

# Distinct class labels present in the joined data
BasicMotions_classes <- as.character(unique(BasicMotions_pyspi_data$group))

# Classification accuracies: per-SPI and all-SPI ("full") models, observed and null
BasicMotions_main_SPI_wise <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_main_SPI_wise_acc.feather")
)
BasicMotions_null_SPI_wise <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_null_SPI_wise_acc.feather")
)
BasicMotions_main_full <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_main_full_acc.feather")
)
BasicMotions_null_full <- pyarrow_feather$read_feather(
  glue("{BasicMotions_data_path}/processed_data/BasicMotions_null_full_acc.feather")
)

# Mean and SD of accuracy per SPI
BasicMotions_main_SPI_wise_mean <- BasicMotions_main_SPI_wise %>%
  dplyr::group_by(SPI) %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

# Mean and SD of accuracy for the full (all-SPI) model, over all rows
BasicMotions_main_full_mean <- BasicMotions_main_full %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

#############################################
# SelfRegulationSCP1
# Change this to wherever you store your data for this repo
EEG_data_path <- "data/SelfRegulationSCP1"

# Raw inputs: sample metadata, time series, and pyspi features (joined to metadata by Sample_ID).
# The `cortical` column is renamed to `group` to serve as the class label.
EEG_metadata <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/SelfRegulationSCP1_sample_metadata.feather")
)
EEG_TS_data <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_TS.feather")
)
EEG_pyspi_data <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_pyspi_filtered.feather")
) %>%
  left_join(EEG_metadata, by = "Sample_ID") %>%
  dplyr::rename("group" = "cortical")

# Distinct class labels present in the joined data
EEG_classes <- as.character(unique(EEG_pyspi_data$group))

# Classification accuracies: per-SPI and all-SPI ("full") models, observed and null
EEG_main_SPI_wise <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_main_SPI_wise_acc.feather")
)
EEG_null_SPI_wise <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_null_SPI_wise_acc.feather")
)
EEG_main_full <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_main_full_acc.feather")
)
EEG_null_full <- pyarrow_feather$read_feather(
  glue("{EEG_data_path}/processed_data/SelfRegulationSCP1_null_full_acc.feather")
)

# Mean and SD of accuracy per SPI
EEG_main_SPI_wise_mean <- EEG_main_SPI_wise %>%
  dplyr::group_by(SPI) %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

# Mean and SD of accuracy for the full (all-SPI) model, over all rows
EEG_main_full_mean <- EEG_main_full %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

#############################################
# Rest versus movie watching fMRI
# Change this to wherever you store your data for this repo
restfilm_data_path <- "data/Rest_vs_Film_fMRI/"

# Raw inputs: pyspi features and time series
restfilm_pyspi_data <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_pyspi_filtered.feather")
)
restfilm_TS_data <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_fMRI_TS_Yeo7.feather")
)

# Keep only the metadata rows that have a match in the pyspi data. No `by` is given,
# so the match is on whatever columns the two tables share once Sample_ID has been
# renamed to Unique_ID.
restfilm_metadata <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/Rest_vs_Film_fMRI_metadata.feather")
) %>%
  semi_join(dplyr::rename(restfilm_pyspi_data, "Unique_ID" = "Sample_ID"))

# Attach metadata by Unique_ID and use Scan_Type as the class label (`group`)
restfilm_pyspi_data <- restfilm_pyspi_data %>%
  dplyr::rename("Unique_ID" = "Sample_ID") %>%
  left_join(restfilm_metadata, by = "Unique_ID") %>%
  dplyr::rename("group" = "Scan_Type")

# Classification accuracies: per-SPI and all-SPI ("full") models, observed and null
restfilm_main_SPI_wise <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_main_SPI_wise_acc.feather")
)
restfilm_null_SPI_wise <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_null_SPI_wise_acc.feather")
)
restfilm_main_full <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_main_full_acc.feather")
)
restfilm_null_full <- pyarrow_feather$read_feather(
  glue("{restfilm_data_path}/processed_data/Rest_vs_Film_fMRI_null_full_acc.feather")
)

# Mean and SD of accuracy per SPI
restfilm_main_SPI_wise_mean <- restfilm_main_SPI_wise %>%
  dplyr::group_by(SPI) %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

# Mean and SD of accuracy for the full (all-SPI) model, over all rows
restfilm_main_full_mean <- restfilm_main_full %>%
  dplyr::summarise(
    Mean_Accuracy = mean(Accuracy, na.rm = TRUE),
    SD_Accuracy = sd(Accuracy, na.rm = TRUE)
  )

```

Compute p-values per SPI for each classification problem:
```{r}
calculate_p_values <- function(main_res, null_accuracy_df) {
  # Compare each row's mean accuracy against a null accuracy distribution and
  # Bonferroni-correct the resulting p-values across the rows of main_res.
  #         Parameters:
  #                 main_res (data.frame): One row per result, with a Mean_Accuracy column
  #                 null_accuracy_df (data.frame): Null results, with a Null_Accuracy column
  #
  #         Returns:
  #                 (data.frame): main_res plus three columns:
  #                     p_value            - one minus the fraction of null accuracies
  #                                          strictly below the row's Mean_Accuracy
  #                     p_value_bonferroni - p_value after Bonferroni adjustment
  #                     significant        - TRUE where p_value_bonferroni < 0.05

  null_distribution <- null_accuracy_df$Null_Accuracy
  n_null <- length(null_distribution)

  # rowwise() makes Mean_Accuracy a scalar inside mutate(), so the comparison
  # against the whole null vector is done once per row
  raw_p_values <- main_res %>%
    dplyr::rowwise() %>%
    dplyr::mutate(p_value = 1 - sum(Mean_Accuracy > null_distribution) / n_null) %>%
    dplyr::ungroup()

  # The correction must see all rows at once, hence it runs after ungroup()
  corrected <- raw_p_values %>%
    dplyr::mutate(p_value_bonferroni = p.adjust(p_value, method = "bonferroni")) %>%
    dplyr::mutate(significant = p_value_bonferroni < 0.05)

  return(corrected)
}

# Each dataset gets two p-value tables: one with a row per SPI, and one for the
# model trained on all SPIs together (the "full" model).

# BasicMotions
BasicMotions_SPI_wise_p_values <- calculate_p_values(
  main_res = BasicMotions_main_SPI_wise_mean,
  null_accuracy_df = BasicMotions_null_SPI_wise
)
BasicMotions_full_p_values <- calculate_p_values(
  main_res = BasicMotions_main_full_mean,
  null_accuracy_df = BasicMotions_null_full
)

# SelfRegulationSCP1
EEG_SPI_wise_p_values <- calculate_p_values(
  main_res = EEG_main_SPI_wise_mean,
  null_accuracy_df = EEG_null_SPI_wise
)
EEG_full_p_values <- calculate_p_values(
  main_res = EEG_main_full_mean,
  null_accuracy_df = EEG_null_full
)

# Rest vs Film
restfilm_SPI_wise_p_values <- calculate_p_values(
  main_res = restfilm_main_SPI_wise_mean,
  null_accuracy_df = restfilm_null_SPI_wise
)
restfilm_full_p_values <- calculate_p_values(
  main_res = restfilm_main_full_mean,
  null_accuracy_df = restfilm_null_full
)

```

## Visualise data

### Histogram visuals for Figure 3 B,E,H

```{r}
plot_SPI_histogram <- function(p_value_df, all_SPI_acc, num_bins, x_min, x_max, xlab, legend_x, legend_y, SPI_to_plot=NULL, SPI_to_plot_name=NULL) {
  # Plot a histogram of per-SPI mean accuracy, with a vertical line marking the
  # accuracy of all SPIs combined and, optionally, a second line for one named SPI.
  #         Parameters:
  #                 p_value_df (data.frame): Per-SPI results with columns SPI, Mean_Accuracy
  #                                          (as a proportion) and significant (logical)
  #                 all_SPI_acc (double): Accuracy for all SPIs combined, already in
  #                                       x-axis units (callers pass 100 * proportion)
  #                 num_bins (int): Number of bins for histogram
  #                 x_min (double): Minimum x value for histogram
  #                 x_max (double): Maximum x value for histogram
  #                 xlab (string): X-axis label
  #                 legend_x (double): X coordinate for legend
  #                 legend_y (double): Y coordinate for legend
  #                 SPI_to_plot (string): SPI to mark with its own line (OPTIONAL)
  #                 SPI_to_plot_name (string): Legend label for that SPI (OPTIONAL;
  #                                            falls back to SPI_to_plot when NULL)
  #
  #         Returns:
  #                 p (ggplot): Histogram plot

  highlight_SPI <- !is.null(SPI_to_plot)

  # Line colours are keyed by legend label so that the colour assignment does not
  # depend on how the labels happen to sort.
  line_colours <- c("All SPIs" = "red")
  legend_spacing <- unit(-0.6, "cm")

  if (highlight_SPI) {
    if (is.null(SPI_to_plot_name)) {
      SPI_to_plot_name <- SPI_to_plot
    }
    SPI_to_plot_val <- 100 * p_value_df$Mean_Accuracy[which(p_value_df$SPI == SPI_to_plot)]
    if (length(SPI_to_plot_val) != 1) {
      stop("Expected exactly one row of p_value_df with SPI == '", SPI_to_plot,
           "', found ", length(SPI_to_plot_val), call. = FALSE)
    }
    line_colours[[SPI_to_plot_name]] <- "black"
    legend_spacing <- unit(-0.8, "cm")
  }

  p <- p_value_df %>%
    dplyr::mutate(Significance = ifelse(significant, "Significant SPIs", "Not Sig")) %>%
    ggplot(data=., mapping=aes(x=100*Mean_Accuracy, fill=Significance)) +
    xlab(xlab) +
    ylab("Proportion") +
    # Bar heights are counts divided by the total count, i.e. proportions
    geom_histogram(color=NA, bins=num_bins, aes(y=after_stat(count)/sum(after_stat(count))),
                   position = "identity") +
    # Only "Significant SPIs" is listed in the limits, so it is the only fill entry
    # in the legend; everything else is drawn with na.value (gray80)
    scale_fill_manual(values = c("Significant SPIs" = "skyblue",
                                 "Not Sig" = "gray80"),
                      limits = c("Significant SPIs"),
                      na.value = "gray80") +
    scale_x_continuous(expand = c(0.01, 0.01), limits = c(x_min, x_max)) +
    scale_y_continuous(expand = c(0, 0),
                       breaks = c(0.00, 0.05, 0.10),
                       labels = c("0.00", "0.05", "0.10")) +
    geom_vline(linewidth=1, key_glyph = "path",
               aes(xintercept = all_SPI_acc,
                   color="All SPIs"))

  if (highlight_SPI) {
    p <- p + geom_vline(linewidth=1, key_glyph="path", aes(xintercept = SPI_to_plot_val,
                                                           color = SPI_to_plot_name))
  }

  # A single colour scale and theme are added once both line layers are in place
  p <- p +
    scale_color_manual(values = line_colours) +
    theme(legend.position=c(legend_x, legend_y),
          legend.title = element_blank(),
          legend.spacing.y = legend_spacing)

  return(p)
}


# BasicMotions: per-SPI accuracy histogram with the all-SPI accuracy marked
plot_SPI_histogram(
  p_value_df = BasicMotions_SPI_wise_p_values,
  all_SPI_acc = 100 * BasicMotions_full_p_values$Mean_Accuracy,
  num_bins = 35,
  x_min = 20,
  x_max = 100,
  xlab = "Average accuracy (%)",
  legend_x = 0.05,
  legend_y = 0.8
)
```

```{r}
# SelfRegulationSCP1: per-SPI accuracy histogram with the all-SPI accuracy marked
plot_SPI_histogram(
  p_value_df = EEG_SPI_wise_p_values,
  all_SPI_acc = 100 * EEG_full_p_values$Mean_Accuracy,
  num_bins = 28,
  x_min = 45,
  x_max = 75,
  xlab = "Average accuracy (%)",
  legend_x = 0.05,
  legend_y = 0.8
)
```

```{r}
# Rest_vs_Film: per-SPI accuracy histogram with the all-SPI accuracy marked,
# plus a second line for the cov_EmpiricalCovariance SPI
plot_SPI_histogram(
  p_value_df = restfilm_SPI_wise_p_values,
  all_SPI_acc = 100 * restfilm_full_p_values$Mean_Accuracy,
  num_bins = 29,
  x_min = 40,
  x_max = 100,
  xlab = "Average accuracy (%)",
  legend_x = 0.05,
  legend_y = 0.7,
  SPI_to_plot = "cov_EmpiricalCovariance",
  SPI_to_plot_name = "Pearson correlation"
)
```

The data underlying these figures can be saved to CSV files for quick reference:
```{r}
# All SPIs combined: one row per problem, labelled SPI = "All", accuracy in percent
all_SPIs_combined <- plyr::rbind.fill(
  BasicMotions_full_p_values %>% dplyr::mutate(Problem = "Smartwatch activity"),
  EEG_full_p_values %>% dplyr::mutate(Problem = "EEG state"),
  restfilm_full_p_values %>% dplyr::mutate(Problem = "fMRI film")
) %>%
  dplyr::mutate(SPI = "All") %>%
  dplyr::select(Problem, SPI, Mean_Accuracy, p_value_bonferroni, significant) %>%
  dplyr::mutate(Mean_Accuracy = 100 * Mean_Accuracy)

# Individual SPIs: one row per SPI per problem, accuracy in percent
individual_SPIs <- plyr::rbind.fill(
  BasicMotions_SPI_wise_p_values %>% dplyr::mutate(Problem = "Smartwatch activity"),
  EEG_SPI_wise_p_values %>% dplyr::mutate(Problem = "EEG state"),
  restfilm_SPI_wise_p_values %>% dplyr::mutate(Problem = "fMRI film")
) %>%
  dplyr::mutate(Mean_Accuracy = 100 * Mean_Accuracy) %>%
  left_join(SPI_info) %>%
  dplyr::select(Problem, SPI, Mean_Accuracy, p_value_bonferroni, significant)

# Stack both tables and write them to a single comma-separated file
plyr::rbind.fill(all_SPIs_combined, individual_SPIs) %>%
  write.table(
    "data/Cliff_Fig3_histogram_raw_data.csv",
    row.names = FALSE,
    col.names = TRUE,
    sep = ","
  )
```


### Violin plot visuals for Figure 3 C,F,I

Smartwatch activity dataset:
```{r, warning=F, message=F}
# Pairwise top feature violin plots
plot_most_discriminative_feature_pair <- function(pyspi_data, 
                                                  group_classes,
                                                  this_SPI,
                                                  ylabel,
                                                  violin_colors = c("#B384EE", "#E087F8")) {
  # For one SPI, run a t-test (stats::t.test, value ~ group) for every pair of
  # classes and every directed pair of nodes, then draw violin plots for the
  # class pair / node pair with the largest absolute t-statistic.
  #         Parameters:
  #                 pyspi_data (data.frame): SPI values with columns SPI, Node_from,
  #                                          Node_to, group and value
  #                 group_classes (vector): Class labels to compare pairwise
  #                 this_SPI (string): SPI to plot
  #                 ylabel (string): Y-axis label
  #                 violin_colors (vector): Colors for the violin plots (OPTIONAL)
  #
  #         Returns:
  #                 A list with two elements:
  #                 top_t_stat (data.frame): The rows of pyspi_data that are plotted,
  #                                          i.e. this_SPI values for the selected class
  #                                          pair and node pair. (The name is historical;
  #                                          it does not hold the t-statistic itself.)
  #                 violin_plot (ggplot): Violin plot

  classes <- as.character(unique(group_classes))
  nodes <- as.character(unique(c(pyspi_data$Node_from, pyspi_data$Node_to)))
  if (length(classes) < 2 || length(nodes) < 2) {
    stop("Need at least two classes and two nodes to form pairwise comparisons",
         call. = FALSE)
  }

  # One row per unordered class pair
  group_pairs <- t(combn(classes, 2))

  # Every unordered node pair is tested in both directions (A -> B, then B -> A).
  # rbind() + as.vector() interleaves the two directions pair by pair.
  node_pairs <- t(combn(nodes, 2))
  from_nodes <- as.vector(rbind(node_pairs[, 1], node_pairs[, 2]))
  to_nodes <- as.vector(rbind(node_pairs[, 2], node_pairs[, 1]))

  # The SPI filter does not depend on the loop variables, so apply it once
  spi_data <- pyspi_data %>%
    dplyr::filter(SPI == this_SPI)

  # Run each pairwise t-test
  t_stat_list <- vector("list", nrow(group_pairs) * length(from_nodes))
  i <- 0L
  for (pair_row in seq_len(nrow(group_pairs))) {
    group_one <- group_pairs[pair_row, 1]
    group_two <- group_pairs[pair_row, 2]
    group_pair_data <- spi_data %>%
      dplyr::filter(group %in% c(group_one, group_two))

    for (node_idx in seq_along(from_nodes)) {
      node_one <- from_nodes[node_idx]
      node_two <- to_nodes[node_idx]
      i <- i + 1L

      combo_data <- group_pair_data %>%
        dplyr::filter(Node_from == node_one,
                      Node_to == node_two)

      t_stat_list[[i]] <- data.frame(
        Group_one = group_one,
        Group_two = group_two,
        Node_one = node_one,
        Node_two = node_two,
        T_stat = unname(t.test(value ~ group, data = combo_data)$statistic)
      )
    }
  }
  t_stat_df <- do.call(rbind, t_stat_list)

  # Find largest magnitude t stat. which.max() skips NA/NaN statistics and
  # returns the first row on ties.
  top_row <- which.max(abs(t_stat_df$T_stat))
  if (length(top_row) == 0) {
    stop("No finite t-statistic could be computed for SPI '", this_SPI, "'",
         call. = FALSE)
  }
  top_groups <- c(t_stat_df$Group_one[top_row], t_stat_df$Group_two[top_row])
  top_node_one <- t_stat_df$Node_one[top_row]
  top_node_two <- t_stat_df$Node_two[top_row]

  # Plot violins
  data_to_plot <- spi_data %>%
    dplyr::filter(group %in% top_groups,
                  Node_from == top_node_one,
                  Node_to == top_node_two)

  p <- data_to_plot %>%
    ggplot(data=., mapping=aes(x=group, y=value, fill=group)) +
    ylab(ylabel) +
    geom_violin() +
    geom_boxplot(width=0.1, fill="gray40") +
    scale_fill_manual(values = violin_colors) +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_text(size=15),
          axis.text.x = element_text(size=15)) +
    theme(legend.position = "none")

  return(list(top_t_stat=data_to_plot, violin_plot=p))

}

# Smartwatch activity: compare every pair of activity classes for the
# cce_kozachenko SPI and plot the most strongly differing combination
BasicMotions_top_pair <- plot_most_discriminative_feature_pair(
  pyspi_data = BasicMotions_pyspi_data,
  group_classes = BasicMotions_classes,
  this_SPI = "cce_kozachenko",
  ylabel = "Causally conditioned\nentropy, Kozachenko",
  violin_colors = c("#B384EE", "#3564A1")
)
BasicMotions_top_pair$violin_plot
```

EEG state dataset:
```{r, warning=F, message=F}
# EEG SelfRegulationSCP1: most strongly differing combination for the cce_gaussian SPI,
# drawn with semi-transparent fill colours
EEG_top_pair <- plot_most_discriminative_feature_pair(
  pyspi_data = EEG_pyspi_data,
  group_classes = EEG_classes,
  this_SPI = "cce_gaussian",
  ylabel = "Causally conditioned\nentropy, Gaussian",
  violin_colors = c(alpha("#3068B8", 0.7), alpha("#BD0005", 0.7))
)
EEG_top_pair$violin_plot
```
 
 
fMRI film dataset:
```{r, message=F, echo=F}
# Rest vs Film: most strongly differing node pair for the reci SPI,
# comparing the "movie" and "rest" classes
restfilm_top_pair <- plot_most_discriminative_feature_pair(
  pyspi_data = restfilm_pyspi_data,
  group_classes = c("movie", "rest"),
  this_SPI = "reci",
  ylabel = "RECI",
  violin_colors = c("#CC87F8", "#97C777")
)
restfilm_top_pair$violin_plot
```


The data underlying these figures can be saved to a CSV file for quick reference:
```{r}
# Compile CSV file with all this data viewable in one file

# The rest-vs-film pyspi data had its Sample_ID column renamed to Unique_ID when it
# was loaded. Without renaming it back, rbind.fill() would pad Sample_ID with NA for
# every fMRI row. The rename is skipped if a Sample_ID column is already present.
restfilm_violin_data <- restfilm_top_pair$top_t_stat
if (!("Sample_ID" %in% colnames(restfilm_violin_data)) &&
    "Unique_ID" %in% colnames(restfilm_violin_data)) {
  restfilm_violin_data <- dplyr::rename(restfilm_violin_data, Sample_ID = Unique_ID)
}

# Stack the plotted observations from the three problems and write them out
plyr::rbind.fill(
  BasicMotions_top_pair$top_t_stat %>% dplyr::mutate(Problem = "Smartwatch activity"),
  EEG_top_pair$top_t_stat %>% dplyr::mutate(Problem = "EEG state"),
  restfilm_violin_data %>% dplyr::mutate(Problem = "fMRI film")
) %>%
  dplyr::select(Problem, group, Sample_ID, SPI, Node_from, Node_to, value) %>%
  write.table(., "data/Cliff_Fig3_Violins_raw_data.csv", row.names = FALSE, col.names = TRUE, sep = ",")
```


### Literature category-wise violin plot visuals for Supplementary Figure 3 A,C,E

```{r}
# Order in which literature categories are laid out along the x-axis
literature_category_order <- c(
  "basic",
  "distance",
  "causal",
  "infotheory",
  "spectral",
  "misc"
)

plot_literature_categories <- function(p_value_df, 
                                    y_min, y_max, 
                                    ylab = "Average accuracy (%)",
                                    full_SPI_acc) {
  # Violin plots of per-SPI mean accuracy, one violin per literature category,
  # with each SPI overlaid as a jittered point and a red line for the all-SPI accuracy.
  # Reads the SPI_info, SPI_module_colours and literature_category_order objects
  # from the calling environment.
  #         Parameters:
  #                 p_value_df (data.frame): Per-SPI results with a Mean_Accuracy column
  #                                          (as a proportion)
  #                 y_min (numeric): Minimum y-axis value
  #                 y_max (numeric): Maximum y-axis value
  #                 ylab (string): Y-axis label
  #                 full_SPI_acc (numeric): Accuracy for the full set of SPIs, in y-axis units
  #         Returns:
  #                 p (ggplot): Violin plot

  set.seed(127) # For geom_jitter reproducibility

  # Colour lookup restricted to the literature categories
  literature_colours <- SPI_module_colours %>%
    dplyr::filter(Module_Type == "Literature")

  # Attach SPI info, keep SPIs that have a literature category, and swap the SPI's own
  # Module column for the colour row matching its literature category
  plot_data <- p_value_df %>%
    left_join(SPI_info) %>%
    dplyr::filter(!is.na(Literature_category)) %>%
    dplyr::select(-Module) %>%
    left_join(literature_colours, by = c("Literature_category" = "Module")) %>%
    dplyr::mutate(
      Literature_category = factor(Literature_category, levels = literature_category_order)
    )

  p <- ggplot(data = plot_data,
              mapping = aes(x = Literature_category, y = 100 * Mean_Accuracy)) +
    geom_violin(aes(fill = Colour), scale = "width") +
    geom_jitter(fill = alpha("black", 0.5), color = alpha("black", 0.5),
                stroke = 0.3, width = 0.1, size = 1) +
    geom_hline(yintercept = full_SPI_acc, color = "red") +
    # The Colour column already holds the colours to draw, so use it as-is
    scale_fill_identity() +
    scale_y_continuous(limits = c(y_min, y_max), expand = c(0, 0)) +
    ylab(ylab) +
    theme(legend.position = "none",
          axis.title.y = element_text(size = 11),
          axis.text.y = element_text(size = 10),
          axis.title.x = element_blank(),
          axis.text.x = element_blank())

  return(p)
}
```

Smartwatch activity dataset:
```{r}
# Smartwatch activity: accuracy by literature category
plot_literature_categories(
  p_value_df = BasicMotions_SPI_wise_p_values,
  y_min = 20,
  y_max = 100,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * BasicMotions_full_p_values$Mean_Accuracy
)
```

EEG state dataset:
```{r}
# EEG state: accuracy by literature category
plot_literature_categories(
  p_value_df = EEG_SPI_wise_p_values,
  y_min = 46,
  y_max = 72,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * EEG_full_p_values$Mean_Accuracy
)
```

fMRI film dataset:
```{r}
# fMRI film: accuracy by literature category
plot_literature_categories(
  p_value_df = restfilm_SPI_wise_p_values,
  y_min = 40,
  y_max = 100,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * restfilm_full_p_values$Mean_Accuracy
)
```

### Hierarchical cluster-wise violin plot visuals for Supplementary Figure 3 B,D,F

```{r}
# Plot performance across hclust modules
plot_hclust_modules <- function(p_value_df, 
                                y_min, y_max, 
                                null_quantile = NULL,
                                ylab = "Average accuracy (%)",
                                full_SPI_acc) {
  # Violin plots of per-SPI mean accuracy, one violin per hierarchical-clustering
  # module, with each SPI overlaid as a jittered point and a red line for the
  # all-SPI accuracy. Reads the SPI_info and SPI_module_colours objects from the
  # calling environment.
  #         Parameters:
  #                 p_value_df (data.frame): Per-SPI results with a Mean_Accuracy column
  #                                          (as a proportion)
  #                 y_min (numeric): Minimum y-axis value
  #                 y_max (numeric): Maximum y-axis value
  #                 null_quantile (numeric): Currently unused -- the line that would draw
  #                                          it is commented out below. Defaults to NULL
  #                                          so that callers may omit it. (OPTIONAL)
  #                 ylab (string): Y-axis label
  #                 full_SPI_acc (numeric): Accuracy for the full set of SPIs, in y-axis units
  #         Returns:
  #                 p (ggplot): Violin plot

  # Colour lookup restricted to the hierarchical-clustering modules
  hclust_colours <- SPI_module_colours %>%
    dplyr::filter(Module_Type == "HClust")

  # Attach SPI info and module colours; SPIs without a module are dropped
  plot_data <- p_value_df %>%
    left_join(SPI_info) %>%
    left_join(hclust_colours) %>%
    dplyr::filter(!is.na(Module))

  p <- ggplot(data = plot_data, mapping = aes(x = Module, y = 100 * Mean_Accuracy)) +
    geom_violin(aes(fill = Colour), scale = "width", width = 0.6) +
    # The Colour column already holds the colours to draw, so use it as-is
    scale_fill_identity() +
    geom_jitter(fill = alpha("black", 0.5), color = alpha("black", 0.5),
                stroke = 0.3, width = 0.1, size = 0.75) +
    scale_y_continuous(limits = c(y_min, y_max), expand = c(0, 0)) +
    ylab(ylab) +
    xlab("Module") +
    # geom_hline(yintercept = null_quantile, color="black") +
    geom_hline(yintercept = full_SPI_acc, color = "red") +
    theme(legend.position = "none",
          axis.title = element_text(size = 11),
          axis.text.y = element_text(size = 10),
          axis.text.x = element_text(size = 8))

  return(p)
}

# The data/Cliff_Fig3_histogram_raw_data.csv file written earlier in this document
# contains the per-SPI accuracies shown in these plots as well
```


Smartwatch activity dataset:
```{r}
# Smartwatch activity: accuracy by hierarchical-clustering module
plot_hclust_modules(
  p_value_df = BasicMotions_SPI_wise_p_values,
  y_min = 20,
  y_max = 100,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * BasicMotions_full_p_values$Mean_Accuracy
)
```

EEG state dataset:
```{r}
# EEG state: accuracy by hierarchical-clustering module
plot_hclust_modules(
  p_value_df = EEG_SPI_wise_p_values,
  y_min = 46,
  y_max = 72,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * EEG_full_p_values$Mean_Accuracy
)
```

fMRI film dataset:
```{r}
# fMRI film: accuracy by hierarchical-clustering module
plot_hclust_modules(
  p_value_df = restfilm_SPI_wise_p_values,
  y_min = 40,
  y_max = 100,
  ylab = "Average accuracy (%)",
  full_SPI_acc = 100 * restfilm_full_p_values$Mean_Accuracy
)
```