To illustrate non-linear interaction effects, we will be working with the GEO dataset GSE80512. Briefly, this examines the effect of stressing yeast with heat or high salt concentration, with wildtype yeast and a yeast with a mutated histone in which post-translational phosphorylation at a certain site is prevented (thought to affect gene regulation). This data set was chosen simply because it is a good quality, publicly available data set which can be used to illustrate the interaction of two experimental factors.

library(topconfects)

library(reshape2)
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)

library(edgeR)

counts <- 
    read.delim("GSE80512_DESeq2_raw_counts.txt.gz", row.names=1) %>% 
    as.matrix

# Retrieve symbol and description for each gene
info <- AnnotationDbi::select(
    org.Sc.sgd.db::org.Sc.sgd.db, 
    rownames(counts), c("GENENAME","DESCRIPTION")) 
all(rownames(counts) == info$ORF)
## [1] TRUE
info <- select(info, GENENAME, DESCRIPTION) %>% as.data.frame
rownames(info) <- rownames(counts)

# Experiment design
samples <- read_csv(
   "strain,stress,sample
    wt,none,Sample_1_13462_CGATGT
    wt,none,Sample_2_13463_TTAGGC
    wt,none,Sample_3_13464_GATCAG
    mutant,none,Sample_4_13465_CCGTCC
    mutant,none,Sample_5_13466_GTCCGC
    mutant,none,Sample_6_13467_ATTCCT
    wt,salt,Sample_7_13468_ATCACG
    wt,salt,Sample_8_13469_GCCAAT
    wt,salt,Sample_9_13470_ACTTGA
    mutant,salt,Sample_10_13471_TAGCTT
    mutant,salt,Sample_11_13472_GGCTAC
    mutant,salt,Sample_12_13473_ATGTCA
    wt,heat,Sample_13_13474_TGACCA
    wt,heat,Sample_14_13475_ACAGTG
    wt,heat,Sample_15_13476_CAGATC
    mutant,heat,Sample_16_13477_GTGGCC
    mutant,heat,Sample_17_13478_CGTACG
    mutant,heat,Sample_18_13479_GAGTGG") %>%
    mutate(
        strain=factor(strain,unique(strain)),
        stress=factor(stress,unique(stress)))
all(samples$sample == colnames(counts))
## [1] TRUE

edgeR fit

design <- model.matrix(~ 0 + strain:stress, data=samples)
colnames(design)
## [1] "strainwt:stressnone"     "strainmutant:stressnone"
## [3] "strainwt:stresssalt"     "strainmutant:stresssalt"
## [5] "strainwt:stressheat"     "strainmutant:stressheat"
y <- DGEList(counts, genes=info)
keep <- rowSums(cpm(y)>2) >= 3
table(keep)
## keep
## FALSE  TRUE 
##   961  5785
y <- y[keep, , keep.lib.sizes=FALSE]
y <- calcNormFactors(y) %>%
    estimateDisp(design)

fit <- glmQLFit(y, design)

fit$df.prior
## [1] 6.215418

Viewing stamp collections

With two possible main effects and an interaction effect, it will be important to view the raw data to understand what is actually going on.

group_rpms <- 
    (exp(fit$coefficients) * 1e6) %>%
    melt(c("gene","coef"), value.name="RPM") %>% 
    separate(coef, c("strain","stress")) %>%
    mutate(strain = factor(gsub("strain","",strain), levels=c("wt","mutant")),
           stress = factor(gsub("stress","",stress), levels=c("none","salt","heat")))

sample_rpms <- 
    cpm(y) %>%
    melt(c("gene","sample"), value.name="RPM") %>%
    left_join(samples,"sample")
## Warning: Column `sample` joining factor and character vector, coercing into
## character vector
show_stamps <- function(names, stressors, n=25) {
    names <- head(names, n)
    
    group_rpms %>% 
    filter(gene %in% names, stress %in% stressors) %>% 
    mutate(gene=factor(gene,names)) %>%
    ggplot(aes(x=stress,color=strain,group=strain,y=RPM)) + 
    geom_line() +
    geom_point(
        data=filter(sample_rpms, gene %in% names, stress %in% stressors), 
        alpha=0.333, size=3, stroke=0) +
    geom_hline(yintercept=0) + 
    facet_wrap(~ gene, scales="free_y")
}

First, let us look at a random sample of genes. Let’s just concentrate on the response to salt for now.

show_stamps(sample(rownames(y)), c("none", "salt"))

Many of the genes show differential expression due to the salt stress. However our primary interest is how this interacts with the histone mutation.

edgeR topTags

Let’s try to identify genes where the response to salt stress is different in the mutant.

This contrast on the log-linear model will search for genes where

(mutant_salt/mutant_none)/(wt_salt/wt_none) != 0

top <- fit %>%
    glmQLFTest(contrast=c(1,-1,-1,1,0,0)) %>%
    topTags(n=Inf)

table(top$table$FDR <= 0.05)
## 
## FALSE  TRUE 
##  5265   520
print(head(top$table, 10), right=FALSE)
##         GENENAME
## YBR021W FUR4    
## YGR043C NQM1    
## YHR096C HXT5    
## YJL033W HCA4    
## YBR072W HSP26   
## YOL124C TRM11   
## YDR070C FMP16   
## YHR065C RRP3    
## YMR011W HXT2    
## YMR169C ALD3    
##         DESCRIPTION                                                                                                                                                                                                                                                                                                                               
## YBR021W Plasma membrane localized uracil permease; expression is tightly regulated by uracil levels and environmental cues; conformational alterations induced by unfolding or substrate binding result in Rsp5p-mediated ubiquitination and degradation                                                                                          
## YGR043C Transaldolase of unknown function; transcription is repressed by Mot1p and induced by alpha-factor and during diauxic shift; NQM1 has a paralog, TAL1, that arose from the whole genome duplication                                                                                                                                       
## YHR096C Hexose transporter with moderate affinity for glucose; induced in the presence of non-fermentable carbon sources, induced by a decrease in growth rate, contains an extended N-terminal domain relative to other HXTs; HXT5 has a paralog, HXT3, that arose from the whole genome duplication                                             
## YJL033W DEAD box RNA helicase; component of the SSU; interacts with Bfr2p and Enp2p; high-copy number suppression of a U14 snoRNA processing mutant suggests an involvement in 18S rRNA synthesis                                                                                                                                                 
## YBR072W Small heat shock protein (sHSP) with chaperone activity; forms hollow, sphere-shaped oligomers that suppress unfolded proteins aggregation; long-lived protein that is preferentially retained in mother cells and forms cytoplasmic foci; oligomer activation requires heat-induced conformational change; also has mRNA binding activity
## YOL124C Catalytic subunit of adoMet-dependent tRNA methyltransferase complex; required for the methylation of the guanosine nucleotide at position 10 (m2G10) in tRNAs; contains a THUMP domain and a methyltransferase domain; another complex member is Trm112p                                                                                 
## YDR070C Protein of unknown function; may be involved in responding to conditions of stress; the authentic, non-tagged protein is detected in highly purified mitochondria in high-throughput studies; protein abundance increases in response to DNA replication stress                                                                           
## YHR065C Protein involved in rRNA processing; required for maturation of the 35S primary transcript of pre-rRNA and for cleavage leading to mature 18S rRNA; homologous to eIF-4a, which is a DEAD box RNA-dependent ATPase with helicase activity                                                                                                 
## YMR011W High-affinity glucose transporter of the major facilitator superfamily; expression is induced by low levels of glucose and repressed by high levels of glucose                                                                                                                                                                            
## YMR169C Cytoplasmic aldehyde dehydrogenase; involved in beta-alanine synthesis; uses NAD+ as the preferred coenzyme; very similar to Ald2p; expression is induced by stress and repressed by glucose                                                                                                                                              
##         logFC     logCPM    F        PValue       FDR         
## YBR021W  1.822642  6.299541 224.8724 1.076038e-11 6.224881e-08
## YGR043C -1.465859  9.097715 172.7814 9.774167e-11 2.827178e-07
## YHR096C -1.818737 10.228356 149.9831 3.130767e-10 6.037162e-07
## YJL033W  1.491774  4.607822 134.9086 7.402521e-10 9.885850e-07
## YBR072W -1.499280 11.969615 132.5314 8.544382e-10 9.885850e-07
## YOL124C  1.271785  4.230186 124.3371 1.426553e-09 1.375434e-06
## YDR070C -1.280466  8.014308 117.7001 2.209317e-09 1.730566e-06
## YHR065C  1.185455  4.371145 115.7508 2.522508e-09 1.730566e-06
## YMR011W  1.524570 10.194933 114.8032 2.692324e-09 1.730566e-06
## YMR169C -1.054112 11.253244 111.5375 3.382360e-09 1.871477e-06

We have significance to spare, but:

show_stamps(rownames(top$table), c("none", "salt"))

The top results by p-value are rather hit-and-miss in terms of a substantial interaction effect.

Note: Taking a leaf out of Di Cook’s playbook, with 520 significant results it would not take long to manually examine all of them. This would only require examining 21 figures such as the one shown above.

Linear confects

We now instead will search for genes where

log2( (mutant_salt/mutant_none)/(wt_salt/wt_none) ) has maximum magnitude.

linear <- edger_confects(fit, contrast=c(1,-1,-1,1,0,0))

linear
## $table
##  rank index confect effect    logCPM    name    GENENAME
##   1    850   1.12    1.822642  6.299541 YBR021W FUR4    
##   2   5210  -1.01   -1.818737 10.228356 YHR096C HXT5    
##   3   3494  -0.88   -1.465859  9.097715 YGR043C NQM1    
##   4   1308   0.87    2.359035  3.965554 YDL063C SYO1    
##   5    900  -0.85   -1.499280 11.969615 YBR072W HSP26   
##   6    547   0.85    1.491774  4.607822 YJL033W HCA4    
##   7   5477   0.83    1.524570 10.194933 YMR011W HXT2    
##   8   2957   0.83    1.755079  4.009971 YOR294W RRS1    
##   9   3063   0.81    1.642126  4.850232 YCL054W SPB1    
##  10   3516   0.77    1.666219  4.068022 YGR067C <NA>    
##  DESCRIPTION                                                                                                                                                                                                                                                                                                                                                                                                       
##  Plasma membrane localized uracil permease; expression is tightly regulated by uracil levels and environmental cues; conformational alterations induced by unfolding or substrate binding result in Rsp5p-mediated ubiquitination and degradation                                                                                                                                                                  
##  Hexose transporter with moderate affinity for glucose; induced in the presence of non-fermentable carbon sources, induced by a decrease in growth rate, contains an extended N-terminal domain relative to other HXTs; HXT5 has a paralog, HXT3, that arose from the whole genome duplication                                                                                                                     
##  Transaldolase of unknown function; transcription is repressed by Mot1p and induced by alpha-factor and during diauxic shift; NQM1 has a paralog, TAL1, that arose from the whole genome duplication                                                                                                                                                                                                               
##  Transport adaptor or symportin; facilitates synchronized nuclear coimport of the two 5S-rRNA binding proteins Rpl5p and Rpl11p; binds to nascent Rpl5p during translation; required for biogenesis of the large ribosomal subunit; green fluorescent protein (GFP)-fusion protein localizes to the cytoplasm and nucleus                                                                                          
##  Small heat shock protein (sHSP) with chaperone activity; forms hollow, sphere-shaped oligomers that suppress unfolded proteins aggregation; long-lived protein that is preferentially retained in mother cells and forms cytoplasmic foci; oligomer activation requires heat-induced conformational change; also has mRNA binding activity                                                                        
##  DEAD box RNA helicase; component of the SSU; interacts with Bfr2p and Enp2p; high-copy number suppression of a U14 snoRNA processing mutant suggests an involvement in 18S rRNA synthesis                                                                                                                                                                                                                         
##  High-affinity glucose transporter of the major facilitator superfamily; expression is induced by low levels of glucose and repressed by high levels of glucose                                                                                                                                                                                                                                                    
##  Essential protein that binds ribosomal protein L11; required for nuclear export of the 60S pre-ribosomal subunit during ribosome biogenesis; localizes to the nucleolus and in foci along nuclear periphery; cooperates with Ebp2p and Mps3p to mediate telomere clustering by binding Sir4p, but is not involved in telomere tethering; mouse homolog shows altered expression in Huntington's disease model mice
##  AdoMet-dependent methyltransferase; involved in rRNA processing and 60S ribosomal subunit maturation; methylates G2922 in the tRNA docking site of the large subunit rRNA and in the absence of snR52, U2921; suppressor of PAB1 mutants                                                                                                                                                                          
##  Putative protein of unknown function; contains a zinc finger motif similar to that of Adr1p                                                                                                                                                                                                                                                                                                                       
## ...
## 520 of 5785 non-zero log2 fold change at FDR 0.05
## Prior df 6.2
## Dispersion 0.0069 to 0.021
## Biological CV 8.3% to 14.7%

show_stamps(linear$table$name, c("none", "salt"))

This is no great improvement. Can you see the problem? A difference in ratios between when the gene is essentially off and when it is on is not interesting. The fact of the gene turning on is the story – but not the story we are interested in!

Non-linear confects

Let us now seek out genes where

salt_mutant/(none_mutant+salt_mutant) - salt_wt/(none_wt+salt_wt) has maximum magnitude.

The response to salt in the mutant, compared to the wildtype.

That is, a “difference of proportions”.

# This creates an object specifying the non-linear effect we are interested in.
# topconfects has several such effect_... functions, and more can easily be added.
# The _log2 is because edgeR has a log link function,
#    and coefficients need to be transformed back to a linear scale.
effect <- effect_shift_log2(c(1,3),c(2,4))

nonlinear <- edger_confects(fit, effect=effect)

nonlinear
## $table
##  rank index confect effect     logCPM    name      GENENAME
##   1    850   0.18    0.2927730  6.299541 YBR021W   FUR4    
##   2   5477   0.12    0.2525727 10.194933 YMR011W   HXT2    
##   3    469   0.10    0.2196894  5.619169 YJL116C   NCA3    
##   4   1692   0.08    0.1715141  9.611573 YDR345C   HXT3    
##   5   5208   0.07    0.1596600  8.429903 YHR092C   HXT4    
##   6    884  -0.07   -0.2036980  4.193461 YBR056W-A <NA>    
##   7   3516   0.07    0.1677288  4.068022 YGR067C   <NA>    
##   8   3224   0.06    0.1456658  9.384778 YGL255W   ZRT1    
##   9   3735   0.05    0.1208114  3.753062 YLL061W   MMP1    
##  10   1888   0.05    0.1455426  8.997924 YIL162W   SUC2    
##  DESCRIPTION                                                                                                                                                                                                                                                                                                                                                                                                          
##  Plasma membrane localized uracil permease; expression is tightly regulated by uracil levels and environmental cues; conformational alterations induced by unfolding or substrate binding result in Rsp5p-mediated ubiquitination and degradation                                                                                                                                                                     
##  High-affinity glucose transporter of the major facilitator superfamily; expression is induced by low levels of glucose and repressed by high levels of glucose                                                                                                                                                                                                                                                       
##  Protein involved in mitochondrion organization; functions with Nca2p to regulate mitochondrial expression of subunits 6 (Atp6p) and 8 (Atp8p) of the Fo-F1 ATP synthase; SWAT-GFP, seamless-GFP and mCherry fusion proteins localize to the vacuole; member of the SUN family; expression induced in cells treated with the mycotoxin patulin; NCA3 has a paralog, UTH1, that arose from the whole genome duplication
##  Low affinity glucose transporter of the major facilitator superfamily; expression is induced in low or high glucose conditions; HXT3 has a paralog, HXT5, that arose from the whole genome duplication                                                                                                                                                                                                               
##  High-affinity glucose transporter; member of the major facilitator superfamily, expression is induced by low levels of glucose and repressed by high levels of glucose; HXT4 has a paralog, HXT7, that arose from the whole genome duplication                                                                                                                                                                       
##  Protein of unknown function; mRNA identified as translated by ribosome profiling data; partially overlaps dubious ORF YBR056C-B; YBR056W-A has a paralog, YDR034W-B, that arose from the whole genome duplication                                                                                                                                                                                                    
##  Putative protein of unknown function; contains a zinc finger motif similar to that of Adr1p                                                                                                                                                                                                                                                                                                                          
##  High-affinity zinc transporter of the plasma membrane; responsible for the majority of zinc uptake; transcription is induced under low-zinc conditions by the Zap1p transcription factor                                                                                                                                                                                                                             
##  High-affinity S-methylmethionine permease; required for utilization of S-methylmethionine as a sulfur source; has similarity to S-adenosylmethionine permease Sam3p                                                                                                                                                                                                                                                  
##  Invertase; sucrose hydrolyzing enzyme; a secreted, glycosylated form is regulated by glucose repression, and an intracellular, nonglycosylated enzyme is produced constitutively                                                                                                                                                                                                                                     
## ...
## 529 of 5785 non-zero effect size at FDR 0.05
## Prior df 6.2
## Dispersion 0.0069 to 0.021
## Biological CV 8.3% to 14.7%
confects_plot(nonlinear)

show_stamps(nonlinear$table$name, c("none", "salt"))

Look at all these wonderful interactions that were hiding in the data!

Appendix: Lack of symmetry

The non-linear effect size we examined is not symmetric in the two experimental factors. We might instead seek genes where

salt_mutant/(salt_wt+salt_mutant) - none_mutant/(none_wt+none_mutant) has maximum magnitude.

The response to the mutation in salt, compared to neutral conditions.

effect2 <- effect_shift_log2(c(1,2),c(3,4))
nonlinear2 <- edger_confects(fit, effect=effect2)
show_stamps(nonlinear2$table$name, c("none", "salt"))

The top ranked genes are different. Here they are not as interesting.


sessionInfo()
## R version 3.4.2 (2017-09-28)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] bindrcpp_0.2      edgeR_3.20.1      limma_3.34.0      readr_1.1.1      
## [5] ggplot2_2.2.1     tidyr_0.7.2       dplyr_0.7.4       reshape2_1.4.2   
## [9] topconfects_0.0.1
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.13         nloptr_1.0.4         org.Sc.sgd.db_3.4.2 
##  [4] compiler_3.4.2       plyr_1.8.4           bindr_0.1           
##  [7] tools_3.4.2          bit_1.1-12           digest_0.6.12       
## [10] RSQLite_2.0          evaluate_0.10.1      memoise_1.1.0       
## [13] tibble_1.3.4         gtable_0.2.0         lattice_0.20-35     
## [16] pkgconfig_2.0.1      rlang_0.1.4          DBI_0.7             
## [19] parallel_3.4.2       yaml_2.1.14          stringr_1.2.0       
## [22] knitr_1.17           IRanges_2.12.0       S4Vectors_0.16.0    
## [25] hms_0.3              tidyselect_0.2.2     bit64_0.9-7         
## [28] stats4_3.4.2         locfit_1.5-9.1       rprojroot_1.2       
## [31] grid_3.4.2           Biobase_2.38.0       glue_1.2.0          
## [34] R6_2.2.2             AnnotationDbi_1.40.0 rmarkdown_1.6       
## [37] blob_1.1.0           purrr_0.2.4          magrittr_1.5        
## [40] splines_3.4.2        BiocGenerics_0.24.0  backports_1.1.1     
## [43] scales_0.5.0         htmltools_0.3.6      assertthat_0.2.0    
## [46] colorspace_1.3-2     labeling_0.3         stringi_1.1.5       
## [49] lazyeval_0.2.1       munsell_0.4.3