--- title: "Hierarchical Heatmaps (hhmR) Vignette" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Hierarchical Heatmaps (hhmR) Vignette} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r, include = FALSE, echo = FALSE, warning = FALSE, message = FALSE} library(dplyr) library(purrr) library(tidyr) library(ggplot2) library(patchwork) # Create example_migration # Define names of fake counties fake_counties = c("Greenridge","Windermoor","Bramblewood","Silverlake", "Thornbury","Maplewood","Hawthorne","Pinehurst", "Riverton","Meadowbrook","Fairhaven","Oakdale","Stonebridge", "Brookfield","Ashford","Glenville","Sunnyvale","Westfield") # Create region county lookup tables rc_lkp = data.frame(region = c(rep("North",3),rep("Midlands",5), rep("South West",4),rep("South East",6)), county = fake_counties) og_lkp = rc_lkp %>% setNames(c("Origin Region" ,"Origin County" )) dn_lkp = rc_lkp %>% setNames(c("Destination Region","Destination County")) # Create dataframe of fake migration data set.seed(1234) example_migration = expand.grid(fake_counties,fake_counties) %>% setNames(paste(c("Origin","Destination"),"County",sep=" ")) %>% full_join(og_lkp) %>% full_join(dn_lkp) %>% mutate(Migration = (1/rgamma(18*18, shape = 17, rate = 0.5)) %>% {. * 1000} %>% round()) example_migration[example_migration$`Origin County` == example_migration$`Destination County`,"Migration"] = example_migration[example_migration$`Origin County` == example_migration$`Destination County`,"Migration"] * 10 # Define names of fake counties fake_counties = c("Greenridge","Windermoor","Bramblewood","Silverlake", "Thornbury","Maplewood","Hawthorne","Pinehurst", "Riverton","Meadowbrook","Fairhaven","Oakdale","Stonebridge", "Brookfield","Ashford","Glenville","Sunnyvale","Westfield") # Create example time_series # Create dataframe of fake migration data set.seed(1234) example_time_series = data.frame(region = c(rep("North",3),rep("Midlands",5), rep("South West",4),rep("South East",6)), county = fake_counties, year_2011 = sample(1:10000,length(fake_counties)), year_2012 = sample(1:10000,length(fake_counties)), year_2013 = sample(1:10000,length(fake_counties)), year_2014 = sample(1:10000,length(fake_counties)), year_2015 = sample(1:10000,length(fake_counties))) %>% setNames(c("Region","County",2011:2015)) %>% pivot_longer(cols = `2011`:`2015`, names_to = "Year", values_to = "Immigration") %>% mutate(Year = as.numeric(Year)) example_time_series[sample(1:(length(fake_counties)*5),5),"Immigration"] = NA # Define cg cg = function(colour1, colour2, n = 15) { # Create a color palette function colour_func <- grDevices::colorRampPalette(c(colour1, colour2)) # Generate the color gradient colour_gradient <- colour_func(n) # Return colour gradient return(colour_gradient) } # Define decimal places decimalplaces <- function(x) { if (abs(x - round(x)) > .Machine$double.eps^0.5) { nchar(strsplit(sub('0+$', '', as.character(x)), ".", fixed = TRUE)[[1]][[2]]) } else { return(0) } } # Define exp_seq exp_seq = function(n,ln=15,exponent=2,round_values=TRUE,rmv_extremes=TRUE) { # Check variables have been entered correctly if (!is.numeric(exponent) || exponent < 1) { stop("`exponent` must be a numer greater than or equal to 1.") } # Create a sequence from 1 to n. # If `n` is specified as 1, the vector will be scaled to between 0 and 1. if (n == 1) { seq = seq(0, 1000, length.out = ln) round_values = FALSE } else { seq = seq(0, n, length.out = ln) } # Apply the logarithm to the sequence exp_seq = seq %>% {.^exponent} # Scale the sequence to the range [0, 1] min_val = min(exp_seq) max_val = max(exp_seq) one_seq = ((exp_seq - min_val) / (max_val - min_val)) # Scale sequence to n if (round_values) { scaled_one_seq = one_seq %>% {. * n} %>% round() } else if (n == 1) { scaled_one_seq = one_seq } else { scaled_one_seq = one_seq %>% {. * n} } # Option to remove zero and the maximum value (i.e. `n`) from the beginning and the end of the vector if (rmv_extremes) { scaled_one_seq = scaled_one_seq %>% .[2:(length(.)-1)] } # Return scaled sequence return(scaled_one_seq) } # Define hhm hhm = function(df,ylower,yupper,xlower,xupper,values,rm_diag=F,lgttl=NULL,bins=NULL,cbrks=NULL,cclrs=NULL,norm_lgd=F,lgdps=0,xttl_height=0.15,yttl_width=0.15) { # Define max value supplied to `values` if (rm_diag) { max_value = max(df[df[[xlower]] != df[[ylower]],values], na.rm = TRUE) } else { max_value = max(df[[values]], na.rm = TRUE) } # Check that supplied model inputs are compatible and won't cause errors if (!is.null(bins) && !is.null(cbrks)) { stop("The inputs bins and cbrks should not be supplied at the same time. bins is used to break the data into a specific number of groups with equal intervals between the min and max values. cbrks is used to manually break the data into groups based on the supplied thresholds. Please provide either one or the other.") } if (!is.null(bins) && !is.null(cclrs)) { if (bins != length(cclrs)) { stop("If both bins and cclrs are provideds, bins and cclrs must both be vectors with cclrs having a length equal to the value of bins.") } } if (!is.null(cbrks) && rm_diag) { if ( (min(cbrks) <= 0) || (max(cbrks) >= max_value) ) { stop(paste0("All values in cbrks must be between 0 and the largest value provided to `values`. In this instance rm_diag == TRUE, so only values not on the diagonal are considered. All values provided to cbrks should therefore be between greater than 0 and less than ",max_value,".")) } } if (!is.null(cbrks) && rm_diag == F) { if ( !is.null(cbrks) && (min(cbrks) <= 0) || (max(cbrks) >= max_value) ) { stop(paste0("All values in cbrks must be between 0 and the largest value provided to `values`. In this instance all values provided to cbrks should therefore be between greater than 0 and less than ",max_value,".")) } } if (!is.null(cbrks) && !is.null(cclrs)) { if ( length(cbrks) != (length(cclrs)-2) ) { stop("If both cbrks and cclrs are provided, cbrks and cclrs must both be vectors with cclrs having a length two longer than cbrks.") } } if (!is.null(cbrks) && norm_lgd) { if (min(range(cbrks)) < 0 || max(range(cbrks)) > 1) { stop("If normalising the values (norm_lgd == TRUE), all breaks provided to cbrks must be between 0 and 1.") } } if (!is.null(cbrks) && (cbrks %>% diff() %>% {. <= 0} %>% sum() %>% {. > 0})) { stop("Please ensure the values in cbrks are provided in ascending order.") } # Remove unwanted rows and format origin so geographies appear in alphabetical order df = df[,c(ylower,xlower,yupper,xupper,values)] # Define the groups to be shown along the x and y axes # If ordering of groups already defined via factor ordering, take this as the order # the groups should appear (top to bottom / left to right). # Otherwise, order groups alphabetically if (!is.null(df[[xupper]] %>% levels())) { xgrps = df[[xupper]] %>% levels() } else { xgrps = df[[xupper]] %>% unique() %>% sort() } if (!is.null(df[[yupper]] %>% levels())) { ygrps = df[[yupper]] %>% levels() } else { ygrps = df[[yupper]] %>% unique() %>% sort() } # If user specified to remove diagonal values, set all observations where ylower and xlower are identical to zero if (rm_diag) { df[df[[ylower]] == df[[xlower]],values] = NA } # Option to normalise values between 0-1 (only works if all values are positive) if (norm_lgd) { # If any values are negative, return error message if ((df[[values]] < 0) %>% sum(na.rm = TRUE) %>% {. > 0}) {stop("norm_lgd is only designed to be used if all values used to populate the heatmap are positive.")} # Otherwise normalise values df[[values]] = df[[values]] / max(df[[values]], na.rm = TRUE) # Unless a value other than zero is supplied (i.e. the user has manually specified a non-default value), set the number of decimal points shown in the legend to 3 if (lgdps == 0) { lgdps = 3 } } # Option to split legend into custom categories if (!is.null(cbrks)) { # If cbrks provided # Add the smallest value possible in R as a lower threshold to cbrks. # This ensures anything that is equal to, or less than, zero is included in the first group. cbrks = c(.Machine$double.xmin,cbrks) # Define names of custom breaks if (lgdps == 0) { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ), cbrks %>% .[2:length(.)] %>% c(.,max_value), sep = "-")) } else if (norm_lgd) { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,1 ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } else { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,max_value) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } # Create discrete scale based on these custom breaks df[[values]] = df[[values]] %>% findInterval(cbrks) %>% {. + 1} %>% addNA() %>% factor(levels = 1:length(brk_nms), labels = brk_nms) } else if (!is.null(bins)) { # If bins provided # Assign breaks to be equidistant thresholds between zero and maximum observed values # Also add the minimum possible value above zero as the first break in the sequence cbrks = seq(0, max(df[[values]], na.rm = TRUE), length.out = bins - 1) %>% .[2:(length(.)-1)] %>% c(.Machine$double.xmin,.) # Define names of custom breaks if (lgdps == 0) { # If set to show whole numbers # Round all values other than the first one (which is the minimum possible value above zero), up to the nearest whole number cbrks = c(.Machine$double.xmin, cbrks %>% .[2:length(.)] %>% ceiling()) # Assign break names between zero and the maximum observed value in the data brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ), cbrks %>% .[2:length(.)] %>% c(.,max_value), sep = "-")) } else if (norm_lgd) { # If data has been normalised # Assign break names between 0 and 1 to the specified number of decimal points (lgdps) brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,1 ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } else { # If using non-rounded, non-normalised data # Assign break names between zero and the maximum observed value in the data to the specified number of decimal points (lgdps) brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,max_value) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } # Create discrete scale based on these custom breaks df[[values]] = df[[values]] %>% findInterval(cbrks) %>% replace(. == length(cbrks), (length(cbrks)-.Machine$double.xmin)) %>% {. + 1} %>% addNA() %>% factor(levels = 1:length(brk_nms), labels = brk_nms) } else { # Otherwise define consistent legend scale range lg_lims = df[[values]] %>% range(na.rm = TRUE) } # Define legend title (if not defined by user) if (is.null(lgttl) && norm_lgd) { lgttl = "Normalised\nValues" } else if (is.null(lgttl) && norm_lgd == F) { lgttl = "Values" } # Create empty list to populate with ggplot heatmaps pl = list() # Define vectors capturing the number of lower categories in each upper group xglns = df %>% group_split(!!rlang::sym(xupper)) %>% purrr::map(~ .[[xlower]] %>% unique() %>% length()) %>% unlist() yglns = df %>% group_split(!!rlang::sym(yupper)) %>% purrr::map(~ .[[ylower]] %>% unique() %>% length()) %>% unlist() # Counter to keep track of interations of nested for loop i = 0 # For each y-axis group for (ygrp in 1:length(ygrps)) { # For each x-axis group for (xgrp in 1:length(xgrps)) { # Increase interature counter by 1 i = i + 1 # Filter group-level migration data to only include origin and destination regions of interest sdf = df[df[[yupper]] == ygrps[ygrp] & df[[xupper]] == xgrps[xgrp],] # Order lower categories alphabetically sdf[[xlower]] = factor(sdf[[xlower]], levels = sdf[[xlower]] %>% unique() %>% sort() ) sdf[[ylower]] = factor(sdf[[ylower]], levels = sdf[[ylower]] %>% unique() %>% sort() %>% rev() ) # Define main plot p = ggplot(sdf, aes(.data[[xlower]], .data[[ylower]])) + geom_tile(aes(fill = .data[[values]]), show.legend = TRUE) + theme(plot.margin = unit(rep(0,4), "cm"), axis.text.x = element_text(angle = 90, hjust = 1.0, vjust = 0.3), axis.title.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5), axis.title.y = element_text(angle = 0, hjust = 0.5, vjust = 0.5), axis.ticks = element_blank()) + labs(x = xgrps[xgrp], y = ygrps[ygrp]) # Define colour scale if (!is.null(cbrks) && !is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cclrs , drop = F, na.value = "white") } else if (!is.null(cbrks) && is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cg("white","#08306B",(length(cbrks)+1)), drop = F, na.value = "white") } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = c(0,1) , na.value = "white") } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = lg_lims, na.value = "white") } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("white","#08306B"), limits = c(0,1) , na.value = "white") } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("white","#08306B"), limits = lg_lims, na.value = "white") } # To prevent legend showing NA values if rm_diag set to TRUE (in which case, diagonal set to NA), only show legend for plots that are not on the diagonal if (rm_diag && (sdf[[values]] %>% is.na() %>% sum() %>% {. > 0}) && (ygrp == xgrp) ) { p = p + theme(legend.position = "none") } # If bottom-left plot if (ygrp == length(ygrps) & xgrp == 1) { # Include provincia names on both axes p = p + theme(axis.title.x = element_blank(), axis.title.y = element_blank()) } else if (ygrp < length(ygrps) & xgrp == 1) { # If left-hand plot # Include provincia names on y-axis p = p + theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank()) } else if (ygrp == length(ygrps) & xgrp > 1) { # If bottom plot # Include provincia names on x-axis p = p + theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank()) } else { # If plot not on left of bottom edges of multiplot # Remove provincia names p = p + theme(axis.title.x = element_blank(), axis.title.y = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank()) } # Add ggplot to plot list pl[[i]] = p } } # Define plot heights and widths (including group titles) wds = c((sum(xglns)*yttl_width),xglns) hts = c(yglns,(sum(yglns)*xttl_height)) # Define plot spacer ps = plot_spacer() # Create empty lists to be populated with plot titles xttls = list() yttls = list() # Define plot titles for (xgrp in 1:length(xgrps)) { xttls[[xgrp]] = plt_ttl(xgrps[xgrp]) } for (ygrp in 1:length(ygrps)) { yttls[[ygrp]] = plt_ttl(ygrps[ygrp],axs="y") } # Create empty list to populate with both plot title and heatmap tiles in the correct order plts = list() # Define counters for subsetting plot title and heatmap lists, to ensure they are ordered correctly i = 1 j = 1 # For each group (row), assign each plot title, then the heatmap tiles within that row to plts list for (ygrp in 1:length(yttls)) { # Add plot title to list plts[[i]] = yttls[[ygrp]] # Add heatmap plots to list plts[(i+1):(i+length(xttls))] = pl[j:(j+length(xttls)-1)] # Adjust counters i = i + 1 + length(xttls) j = j + length(xttls) } # Add x-axis plots plts[[length(plts)+1]] = ps plts[(length(plts)+1):(length(plts)+length(xttls))] = xttls # Define final plot plt = patchwork::wrap_plots(plts, widths = wds, heights = hts, guides = "collect") # Return final plot return(plt) } # Define log_seq log_seq = function(n,ln=15,round_values=T,rmv_extremes=F) { # Create a sequence from 1 to n. # If `n` is specified as 1, the vector will be scaled to between 0 and 1. if (n == 1) { seq = seq(1, 1000, length.out = ln) round_values = F } else { seq = seq(1, n, length.out = ln-1) } # Apply the logarithm to the sequence log_seq = log(seq) # Scale the sequence to the range [0, 1] min_val = min(log_seq) max_val = max(log_seq) one_seq = ((log_seq - min_val) / (max_val - min_val)) # Reverse pattern of scale so breaks are focussed on lower rather than upper end of scale one_seq_rev = one_seq %>% {. - max(.)} %>% {. * -1} %>% rev() # Scale sequence to n if (round_values) { scaled_one_seq = one_seq_rev %>% {. * n} %>% round() %>% .[2:length(.)] %>% c(0,1,.) } else if (n == 1) { scaled_one_seq = one_seq_rev } else { scaled_one_seq = one_seq_rev %>% {. * n} %>% .[2:length(.)] %>% c(0,.Machine$double.xmin,.) } # Option to remove zero and the maximum value (i.e. `n`) from the beginning and the end of the vector if (rmv_extremes) { scaled_one_seq = scaled_one_seq %>% .[2:(length(.)-1)] } # Return scaled sequence return(scaled_one_seq) } # Define plt_ttl plt_ttl = function(ttl,axs="x",rotate_title=TRUE) { # If plotting on x-axis if (axs == "x") { # Place at top of plot p = ggplot(data.frame(x = 0:1, y = 0:1), aes(x = .data[["x"]], y = .data[["y"]])) + geom_point(col = "white") + coord_cartesian(xlim = c(0,1), ylim = c(0,1)) + theme_void() + theme(plot.margin = unit(rep(0,4), "cm")) # Speficy wehther title should be roated 90 degrees or not if (rotate_title) { p = p + geom_text(x = 0.495, y = 1.0, angle = 90, label = ttl, size = 4, hjust = 1) } else if (rotate_title == FALSE) { p = p + geom_text(x = 0.495, y = 0.5, angle = 0, label = ttl, size = 4, hjust = 1) } else { stop("rotate_title must be TRUE or FALSE.") } } else if (axs == "y") { # If plotting on y-axis # Place at left of plot p = ggplot(data.frame(x = 0:1, y = 0:1), aes(x = .data[["x"]], y = .data[["y"]])) + geom_point(col = "white") + coord_cartesian(xlim = c(0,1), ylim = c(0,1)) + theme_void() + theme(plot.margin = unit(rep(0,4), "cm")) # Speficy wehther title should be roated 90 degrees or not if (rotate_title) { p = p + geom_text(x = 1, y = 0.51, angle = 0, label = ttl, size = 4, hjust = 1) } else if (rotate_title == FALSE) { p = p + geom_text(x = 1, y = 0.51, angle = 90, label = ttl, size = 4, hjust = 1) } else { stop("rotate_title must be TRUE or FALSE.") } } # Return region name plot return(p) } # Define tshhm tshhm = function(df,lower,upper,times,values,sort_lower="alphabetical",lgttl=NULL,bins=NULL,cbrks=NULL,cclrs=NULL,norm_lgd=F,lgdps=0,na_colour=NULL,xttl_height=0.05,yttl_width=0.15) { # Define max value supplied to `values` max_value = max(df[[values]], na.rm = TRUE) # Check that supplied model inputs are compatible and won't cause errors if (!is.null(bins) && !is.null(cbrks)) { stop("The inputs bins and cbrks should not be supplied at the same time. bins is used to break the data into a specific number of groups with equal intervals between the min and max values. cbrks is used to manually break the data into groups based on the supplied thresholds. Please provide either one or the other.") } if (!is.null(bins) && !is.null(cclrs)) { if (bins != length(cclrs)) { stop("If both bins and cclrs are provideds, bins and cclrs must both be vectors with cclrs having a length equal to the value of bins.") } } if (!is.null(cbrks) && !is.null(cclrs)) { if ( length(cbrks) != (length(cclrs)-2) ) { stop("If both cbrks and cclrs are provided, cbrks and cclrs must both be vectors with cclrs having a length two longer than cbrks.") } } if (!is.null(cbrks) && norm_lgd) { if (min(range(cbrks)) < 0 || max(range(cbrks)) > 1) { stop("If normalising the values (norm_lgd == TRUE), all breaks provided to cbrks must be between 0 and 1.") } } if (!is.null(cbrks) && (cbrks %>% diff() %>% {. <= 0} %>% sum() %>% {. > 0})) { stop("Please ensure the values in cbrks are provided in ascending order.") } # Remove unwanted rows and format origin so geographies appear in alphabetical order df = df[,c(lower,upper,times,values)] # If no colour has been assigned for NA values, then remove them from the dataset if (is.null(na_colour)) { df = df %>% filter(!is.na(!!rlang::sym(values))) } # Define the groups to be shown along the y-axis # If ordering of groups already defined via factor ordering, take this as the order # the groups should appear (top to bottom). Otherwise, order groups alphabetically if (!is.null(df[[upper]] %>% levels())) { ygrps = df[[upper]] %>% levels() } else { ygrps = df[[upper]] %>% unique() %>% sort() } # Option to normalise values between 0-1 (only works if all values are positive) if (norm_lgd) { # If any values are negative, return error message if ((df[[values]] < 0) %>% sum(na.rm = TRUE) %>% {. > 0}) {stop("norm_lgd is only designed to be used if all values used to populate the heatmap are positive.")} # Otherwise normalise values df[[values]] = df[[values]] / max(df[[values]], na.rm = TRUE) # Unless a value other than zero is supplied (i.e. the user has manually specified a non-default value), set the number of decimal points shown in the legend to 3 if (lgdps == 0) { lgdps = 3 } } # Option to split legend into custom categories if (!is.null(cbrks)) { # If cbrks provided # Add the smallest value possible in R as a lower threshold to cbrks. # This ensures anything that is equal to, or less than, zero is included in the first group. cbrks = c(.Machine$double.xmin,cbrks) # Define names of custom breaks if (lgdps == 0) { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ), cbrks %>% .[2:length(.)] %>% c(.,max_value), sep = "-")) } else if (norm_lgd) { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,1 ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } else { brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,max_value) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } # Create backup version of origin variable so rows can still be ordered by row total or row means if (sort_lower != "alphabetical") { df[[paste0(values,"_old")]] = df[[values]] } # Create discrete scale based on these custom breaks df[[values]] = df[[values]] %>% findInterval(cbrks) %>% {. + 1} %>% addNA() %>% factor(levels = 1:length(brk_nms), labels = brk_nms) } else if (!is.null(bins)) { # If bins provided # Assign breaks to be equidistant thresholds between zero and maximum observed values # Also add the minimum possible value above zero as the first break in the sequence cbrks = seq(0, max(df[[values]], na.rm = TRUE), length.out = bins - 1) %>% .[2:(length(.)-1)] %>% c(.Machine$double.xmin,.) # Define names of custom breaks if (lgdps == 0) { # If set to show whole numbers # Round all values other than the first one (which is the minimum possible value above zero), up to the nearest whole number cbrks = c(.Machine$double.xmin, cbrks %>% .[2:length(.)] %>% ceiling()) # Assign break names between zero and the maximum observed value in the data brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ), cbrks %>% .[2:length(.)] %>% c(.,max_value), sep = "-")) } else if (norm_lgd) { # If data has been normalised # Assign break names between 0 and 1 to the specified number of decimal points (lgdps) brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,1 ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } else { # If using non-rounded, non-normalised data # Assign break names between zero and the maximum observed value in the data to the specified number of decimal points (lgdps) brk_nms = c(0, paste(cbrks %>% .[2:length(.)] %>% c(0 ,. ) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), cbrks %>% .[2:length(.)] %>% c(.,max_value) %>% sprintf(fmt = paste0('%#.',lgdps,'f')), sep = "-")) } # Create backup version of origin variable so rows can still be ordered by row total or row means if (sort_lower != "alphabetical") { df[[paste0(values,"_old")]] = df[[values]] } # Create discrete scale based on these custom breaks df[[values]] = df[[values]] %>% findInterval(cbrks) %>% replace(. == length(cbrks), (length(cbrks)-.Machine$double.xmin)) %>% {. + 1} %>% addNA() %>% factor(levels = 1:length(brk_nms), labels = brk_nms) } else { # Otherwise define consistent legend scale range lg_lims = df[[values]] %>% range(na.rm = TRUE) } # Define legend title (if not defined by user) if (is.null(lgttl) && norm_lgd) { lgttl = "Normalised\nValues" } else if (is.null(lgttl) && norm_lgd == F) { lgttl = "Values" } # Create empty list to populate with ggplot heatmaps pl = list() # Define vectors capturing the number of lower categories in each upper group yglns = df %>% group_split(!!rlang::sym(upper)) %>% purrr::map(~ .[[lower]] %>% unique() %>% length()) %>% unlist() # For each y-axis group for (ygrp in 1:length(ygrps)) { # Filter group-level migration data to only include origin and destination regions of interest sdf = df[df[[upper]] == ygrps[ygrp],] # If breaking data into categories, and sorting values by row values, then use continuous version of values to sort rows if (is.null(cbrks) && sort_lower != "alphabetical") { sort_var = values } else if (!is.null(cbrks) && sort_lower != "alphabetical") { sort_var = paste0(values,"_old") } # Define how rows of the lower categories should be arranged if (sort_lower == "alphabetical") { sdf[[lower]] = factor(sdf[[lower]], levels = sdf[[lower]] %>% unique() %>% sort() %>% rev() ) } else if (sort_lower == "sum_ascend") { sdf[[lower]] = factor(sdf[[lower]], levels = sdf %>% group_by(!!rlang::sym(lower)) %>% summarise(sum = sum(!!rlang::sym(sort_var), na.rm = TRUE)) %>% arrange(sum) %>% .[[lower]] %>% rev() ) } else if (sort_lower == "sum_descend") { sdf[[lower]] = factor(sdf[[lower]], levels = sdf %>% group_by(!!rlang::sym(lower)) %>% summarise(sum = sum(!!rlang::sym(sort_var), na.rm = TRUE)) %>% arrange(sum) %>% .[[lower]] ) } else if (sort_lower == "mean_ascend") { sdf[[lower]] = factor(sdf[[lower]], levels = sdf %>% group_by(!!rlang::sym(lower)) %>% summarise(mean = mean(!!rlang::sym(sort_var), na.rm = TRUE)) %>% arrange(mean) %>% .[[lower]] %>% rev() ) } else if (sort_lower == "mean_descend") { sdf[[lower]] = factor(sdf[[lower]], levels = sdf %>% group_by(!!rlang::sym(lower)) %>% summarise(mean = mean(!!rlang::sym(sort_var), na.rm = TRUE)) %>% arrange(mean) %>% .[[lower]] ) } else { stop("The variable sort_lower should be defined as one of the following: alphabetical, sum_ascend, sum_descend, mean_ascend, mean_descend. See `?tshm` for details.") } # Define main plot p = ggplot(sdf, aes(.data[[times]], .data[[lower]])) + geom_tile(aes(fill = .data[[values]]), show.legend = TRUE) + theme_minimal() + theme(plot.margin = unit(rep(0,4), "cm"), axis.title.y = element_text(angle = 0, hjust = 20.0, vjust = 0.5), axis.ticks = element_blank()) + #labs(y = ygrps[ygrp]) labs(x = NULL, y = NULL) # Define colour scale if (is.null(na_colour)) { if (!is.null(cbrks) && !is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cclrs , drop = F, na.translate = FALSE) } else if (!is.null(cbrks) && is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cg("grey90","#08306B",(length(cbrks)+1)), drop = F, na.translate = FALSE) } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = c(0,1) ) } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = lg_lims) } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("grey90","#08306B"), limits = c(0,1) ) } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("grey90","#08306B"), limits = lg_lims) } } else { if (!is.null(cbrks) && !is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cclrs , drop = F, na.value = na_colour) } else if (!is.null(cbrks) && is.null(cclrs)) { p = p + scale_fill_manual(name = lgttl, values = cg("grey90","#08306B",(length(cbrks)+1)), drop = F, na.value = na_colour) } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = c(0,1) , na.value = na_colour) } else if (is.null(cbrks) && !is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cclrs , limits = lg_lims, na.value = na_colour) } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("grey90","#08306B"), limits = c(0,1) , na.value = na_colour) } else if (is.null(cbrks) && is.null(cclrs) && norm_lgd == F) { p = p + scale_fill_gradientn(name = lgttl, colours = cg("grey90","#08306B"), limits = lg_lims, na.value = na_colour) } } # To avoid creating multiple legends, if the user has specified a colour for NA values then do not show any legends # for plots where no NA values occured. This assumed that at least one plot the multiplot contains NA values, which seems # reasonable, as why else would the user bother to specify the colour of NA values) if (!is.null(na_colour) && (sdf[[values]] %>% is.na() %>% sum() %>% {. == 0}) ) { p = p + theme(legend.position = "none") } # If not bottom plot, remove x-axis text and title if (ygrp != length(ygrps)) { p = p + theme(axis.title.x = element_blank(), axis.text.x = element_blank()) } # Add ggplot to plot list pl[[ygrp]] = p } # Define plot heights and widths (including group titles) wds = c(yttl_width,1) hts = c(yglns,(sum(yglns)*xttl_height)) # Define plot spacer ps = plot_spacer() # Create empty list to be populated with plot titles yttls = list() # Define plot titles for (ygrp in 1:length(ygrps)) { yttls[[ygrp]] = plt_ttl(ygrps[ygrp],axs="y") } # Create empty list to populate with both plot title and heatmap tiles in the correct order plts = list() # Define counters for subsetting plot title and heatmap lists, to ensure they are ordered correctly i = 1 j = 1 # For each group (row), assign each plot title, then the heatmap tiles within that row to plts list for (ygrp in 1:length(yttls)) { # Add plot title to list plts[[i]] = yttls[[ygrp]] # Add heatmap plots to list plts[i+1] = pl[j] # Adjust counters i = i + 2 j = j + 1 } # Add x-axis plots plts[[length(plts)+1]] = ps plts[[length(plts)+1]] = plt_ttl(times, rotate_title = FALSE) # Define final plot plt = patchwork::wrap_plots(plts, widths = wds, heights = hts, guides = "collect") # Return final plot return(plt) } ```
## Overview `hhmR` allows users to create high-quality heatmaps from labelled, hierarchical data. Specifically, for data with a two-level hierarchical structure, it will produce a heatmap where each row and column represents a category at the lower level. These rows and columns are then grouped by the higher-level group each category belongs to, with the names for each category and groups shown in the margins. While other packages (e.g. `dendextend`) allow heatmap rows and columns to be arranged by groups, I believe this is the only R package which also labels the data at both levels - i.e. both category and group names are shown along the left and bottom margins.
## Hierarchical Heatmaps (`hhm`) The main function within the package is `hhm`. This function is useful if you wish to create a heatmap where the categories shown on both the x and y axis can be grouped in some way. This heatmap will order the categories by their assigned group and present both the categories and group labels along the axes. To illustrate how this function can be used, we use internal migration data from a fake country (`example_migration`). This country is made up of regions, with each region containing multiple counties. This hierachical structure is summarised below: ```{r, warning = FALSE, message = FALSE} # Import dplyr for data cleaning library(dplyr) # Summarise hierarchical data structure example_migration %>% group_by(`Origin Region`) %>% reframe(`Origin County` = unique(`Origin County`)) %>% setNames(c("Region","County")) ``` This dataset contains information on the number of people that have moved between these counties / regions over a given period of time. It contains five columns: an `Origin County` (and the `Origin Region` it is within), a `Destination County` (and the `Destination Region` it is within), and the number of people who have migrated between the origin and destination counties during the observation period (`Migration`). ```{r} # Show data head(example_migration) ``` #### Initial Heatmap A useful way of visualising migration data is using a migration matrix or heatmap. However, as this data is hierarchical, it would also be useful to show how migrants have moved between regions within the same figure. This is where the `hhm` function can be useful. It can create a heatmap which shows the number of migrants that have moved between the different counties, with the rows and columns ordered and labelled by region. To run the function, `Origin County` and `Destination County` are provided as lower-level categories to be shown along the y and x axes respectively (i.e. `ylower` and `xlower`). Additionally, `Origin Region` and `Destination Region` are provided as upper-level groups via which the rows and columns are ordered (i.e. `yupper` and `xupper`). Finally, `Migration` provides the values with which to populate the heatmap, while `yttl_width` and `xttl_height` are used to ensure there is enough space for the region names to be shown correctly. ```{r, fig.width=7.15, fig.height=6.15} # Create Intial heatmap hierarchical_heatmap = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22) # View result hierarchical_heatmap ``` #### Remove diagonal In the above example, the darkest colours are all shown along the diagonal. This is because, across a given period of time, the majority of the population does not migrate, meaning their origin and destination counties are the same. In order to focus on people who moved between different counties, we can remove the diagonal by setting `rm_diag` to `TRUE`. ```{r, fig.width=7.15, fig.height=6.15} # Remove diagonal from heatmap (i.e. hide static populations) removed_diag = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE) # View result removed_diag ``` #### Normalise legend Sometimes it can be useful to normalise the values within a heatmap, so different datasets can be compared using the same legend. This can be easily done by setting `norm_lgd` to `TRUE`. ```{r, fig.width=7.15, fig.height=6.15} # Nomalise the legend normalised_lgd = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE, norm_lgd = TRUE) # View result normalised_lgd ``` #### Manually define continuous colour scheme The default colour scheme fades from blue to white. However, the legend colour scheme can be define manually by passing a vector of hexcodes to the argument `cclrs`. In this example, the viridis colour scheme has been provided. ```{r, fig.width=7.15, fig.height=6.15} # Manually define colour scheme for heatmap (uses viridis colour scheme) viridis_12 = c("#440154FF","#482173FF","#433E85FF","#38598CFF","#2D708EFF","#25858EFF", "#1E9B8AFF","#2BB07FFF","#51C56AFF","#85D54AFF","#C2DF23FF","#FDE725FF") # Assign continuous colour scheme cont_clrs = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE, norm_lgd = TRUE, cclrs = viridis_12) # View result cont_clrs ``` #### Break data into equal interval bins The function also contains the option to break the data into a specified number categories, based on equal interval breaks between 0 and the maximum value within the dataset. This can be done by passing the number of desired categories to `bins`. ```{r, fig.width=7.15, fig.height=6.15} # Break legends into a specified number of bins # (of equal intervals between 0 and the maximum value in `values`) bins_15 = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE, bins = 15) # View result bins_15 ``` #### Manually define interval breaks At times it might be desirable to manually define the intervals breaks between categories. For example, if the data is highly skewed or normally distributed then equal interval breaks may be inappropriate. Manual breaks can be provided by passing a vector of intervals to `cbrks`. In this instance, the `hhmR` function `log_seq` has been used to create a vector of logarithmically increasing values between 1 and the maximum value in the dataset (not on the diagonal). ```{r} # Manually break data into categories using user-specified intervals. cbrks = log_seq(example_migration[example_migration[["Origin County" ]] != example_migration[["Destination County"]],] %>% .$Migration %>% max(), 12, rmv_extremes = TRUE) # Show interval breaks cbrks ``` ```{r, fig.width=7.15, fig.height=6.15} # Manually assign legend categories legend_cats = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE, cbrks = cbrks) # View result legend_cats ``` #### Manually assign categoric colours It is also possible to manually define the colour of each category by passing a vector of hexcodes to `cclrs`. The length of this vector must be two longer then the vector passed to `cbrks`. ```{r, fig.width=7.15, fig.height=6.15} # Manually assign colours to legend categories cat_clrs = hhm(df = example_migration, ylower = "Origin County", xlower = "Destination County", yupper = "Origin Region", xupper = "Destination Region", values = "Migration", yttl_width = 0.22, xttl_height = 0.22, rm_diag = TRUE, cbrks = cbrks, cclrs = viridis_12) # View result cat_clrs ```
## Time-series heatmaps (`tshhm`) Another common challenge with hierarchical data is visualising how it changes over time. The function `tshhm` is design to address this. This function is useful if you wish to create a time-series heatmap where the categories shown on the y axis can be grouped in some way. This heatmap will order the categories by their assigned group and present both the categories and group labels along the y-axis. To illustrate this, we use immigration data between 2011-2015 for the same fake country as the previous example (`example_time_series`). This country is made up of the same regions, with each region containing the same counties as shown. It contains four columns: a `County` (and the `Region` it is within), the `Year` of observation, and the number of immigrants a county recieved in a given year (`Immigration`). ```{r} # Show data head(example_time_series) ``` #### Initial Heatmap To run the function, the variable `County` is provided as the lower-level categories (`lower`) to be shown along the y-axis, while the variable `Region` is provided as the upper-level groups (`upper`) with which to order and group the heatmap rows. Additionally, `Year` is provided as the time intervals to be shown on along the axis (`times`), while `Immigration` provides the values with which to populate the heatmap (`values`). Finally, `yttl_width` and is used to ensure there is enough space for the region names to be shown correctly on the y-axis. Note that NA values are displayed as blank. ```{r, fig.width=7.15, fig.height=4.5} # Intial heatmap time_series_heatmap = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", yttl_width = 0.25) # View result time_series_heatmap ``` #### Sort rows in ascending order It can sometimes be useful to arrange rows in ascending or descending order, depending on the values within the heatmap. The parameter `sum_ascend` allows for this functionality by arranging the rows within in group. The default option is `alphabetical`, which orders rows in alphabetical order from top to bottom. Other options include `sum_ascend` and `mean_ascend`, which order rows in ascending order (top to bottom) based on the row totals and row means respectively. This order can be reversed with the options `sum_descend` and `mean_descend`. Manually defining the order of upper-level groups (`Region` in this example) is demonstrated at the end of the vignette. ```{r, fig.width=7.15, fig.height=4.5} # Arrange counties within each region by total number of immigrants # across all five years (ascending from top to bottom) sort_ascending = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", yttl_width = 0.25) # View result sort_ascending ``` #### Normalise legend Sometimes it can be useful to normalise the values within a heatmap, so different datasets can be compared using the same legend. This can be easily done by setting `norm_lgd` to `TRUE`. ```{r, fig.width=7.15, fig.height=4.5} # Nomalise the legend normalised_lgd = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", norm_lgd = TRUE, yttl_width = 0.25) # View result normalised_lgd ``` #### Manually define continuous colour scheme The legend colour scheme can be defined manually by passing a vector of hexcodes to the argument `cclrs`. In this example, the viridis colour scheme has been provided. ```{r, fig.width=7.15, fig.height=4.5} # Assign continuous colour scheme cont_clrs = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", norm_lgd = TRUE, cclrs = viridis_12, yttl_width = 0.25) # View result cont_clrs ``` #### Assign NA colours It can sometimes be useful to visually highlight cells where data is missing. This can be done by passing a hexcode to the argument `na_colour`. This will cause all NA values within the dataset to be displayed in the specified colour. ```{r, fig.width=7.15, fig.height=4.5} # Assign colour for NA values na_clrs = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", norm_lgd = TRUE, cclrs = viridis_12, na_colour = "grey80", yttl_width = 0.25) # View result na_clrs ``` #### Break data into equal interval bins The function also contains the option to break the data into a specified number categories, based on equal interval breaks between 0 and the maximum value within the dataset. This can be done by passing the number of desired categories to `bins`. ```{r, fig.width=7.15, fig.height=4.5} # Break legends into a specified number of bins # (of equal intervals between 0 and the maximum value in `values`) bins_15 = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", bins = 15, yttl_width = 0.25) # View result bins_15 ``` #### Manually define interval breaks At times it might be desirable to manually define the intervals breaks between categories. For example, if the data is highly skewed or normally distributed then equal interval breaks may be inappropriate. Manual breaks can be provided by passing a vector of intervals to `cbrks`. In this instance, the `hhmR` function `log_seq` has been used to create a vector of logarithmically increasing values between 1 and the maximum value in the dataset. ```{r} # Manually break data into categories using user-specified intervals. cbrks = log_seq(example_time_series %>% .$Immigration %>% max(na.rm = TRUE), 12, rmv_extremes = TRUE) # Show breaks cbrks ``` ```{r, fig.width=7.15, fig.height=4.5} # Manually assign legend categories legend_cats = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", cbrks = cbrks, yttl_width = 0.25) # View result legend_cats ``` #### Manually assign categoric colours It is also possible to manually define the colour of each category by passing a vector of hexcodes to `cclrs`. The length of this vector must be two longer then the vector passed to `cbrks`. ```{r, fig.width=7.15, fig.height=4.5} # Manually assign colours to legend categories cat_clrs = tshhm(df = example_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", cbrks = cbrks, cclrs = viridis_12, na_colour = "grey80", yttl_width = 0.25) # View result cat_clrs ``` #### Manually define the order of x-axis values and y-axis groups If the variable supplied to `times` is numeric, it will by default sort values in ascending order from left to right along the x-axis. However, this order can be manually changed by supplying a factor to `times`. To demonstrate this, we have supplied the variable `Year` as a factor with the years 2011-2015 ordered non-chronologically. Similarly, the groups supplied to `Upper` will by default be displayed alphabetically from top to bottom along the y-axis. However, this order can be manually changed by supplying a factor to `upper`. In the below example, we have supplied the variable `Region` with regions ordered from North-West to South-East. ```{r, fig.width=7.15, fig.height=4.5} # Manually define order of x-axis and groups using factor levels new_time_series = example_time_series %>% mutate(Year = factor(Year, levels = c(2012,2011,2014, 2013,2015)), Region = factor(Region, levels = c("North","Midlands", "South West", "South East"))) # Manually define order of x-axis and groups rearrange_axes = tshhm(df = new_time_series, lower = "County", upper = "Region", times = "Year", values = "Immigration", sort_lower = "sum_ascend", cbrks = cbrks, cclrs = viridis_12, na_colour = "grey80", yttl_width = 0.25) # View result rearrange_axes ```