The hardware and bandwidth for this mirror is donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.
Code for reproducing the results shown in the manuscript.
### Loading functions. ###
inst <- rownames(utils::installed.packages())
cran <- c("devtools","R.utils","Matrix","glmnet","pROC","BiocManager","ashr")
# "googledrive", "httpuv"
if(!all(cran %in% inst)){
for(i in seq_along(cran)){
if(!cran[i] %in% inst){
install.packages(cran[i])
}
}
}
bioc <- c("edgeR","TCGAbiolinks")
if(!all(bioc %in% inst)){
#source("http://bioconductor.org/biocLite.R")
for(i in seq_along(bioc)){
if(!bioc[i] %in% inst){
#biocLite(bioc[i])
BiocManager::install(bioc[i])
}
}
}
#if(!"ashr" %in% inst){
# devtools::install_github("stephens999/ashr")
#}
user <- Sys.getenv("USERNAME")
path <- file.path("C:","Users",user,"Desktop","palasso")
if(user=="arra"){path <- "C:/Users/arra/Desktop/MATHS/palasso_desktop"}
if(user==""){path <- "/virdir/Scratch/arauschenberger/palasso"}
setwd(path)
folders <- c("data","results")
invisible(sapply(folders,function(x) if(!dir.exists(x)){dir.create(x)}))
if(user!="arra"){
devtools::install_github("kkdey/CorShrink") # ref="a9f6ba0"
devtools::install_github("rauschenberger/palasso") # ref="4a995a2"
}
if(FALSE){
# The functions <<save>>, <<file.exists>> and <<file.remove>> access the hard disk, but also try to access googledrive.
save <- function(object,file){
base::save(object,file=file)
tryCatch(expr=googledrive::drive_upload(media=file,path=file),
error=function(x) NULL)
#Sys.sleep(0.5)
}
file.exists <- function(file){
offline <- base::file.exists(file)
online <- FALSE
if(!offline){
d <- googledrive::as_dribble(x=file)
online <- tryCatch(expr=googledrive::some_files(d),
error=function(x) FALSE)
#Sys.sleep(0.5)
}
return(offline|online)
}
file.remove <- function(file){
base::file.remove(file)
tryCatch(expr=googledrive::drive_trash(file=file),
error=function(x) NULL)
#Sys.sleep(0.5)
}
# The function <<update>> moves files from the research cloud to the remote drive. Given both paths, it first verifies whether the folders SIM, GDC and CCA are available, and then copies all missing files from the research cloud to the remote drive.
# from: path to the origin
# to: path to the destination
update <- function(from,to){
dir <- c("SIM","GDC","CCA")
if(any(!dir.exists(file.path(from,dir)))){stop("Invalid.")}
if(any(!dir.exists(file.path(to,dir)))){stop("Invalid.")}
pb <- utils::txtProgressBar(min=0,max=1,style=3)
for(i in seq_along(dir)){
files0 <- dir(file.path(from,dir[i]))
files1 <- dir(file.path(to,dir[i]))
names <- files0[!files0 %in% files1]
for(j in seq_along(names)){
utils::setTxtProgressBar(pb=pb,value=(i-1)/3+(j*i)/(3*length(names)))
file.copy(from=file.path(from,dir[i],names[j]),
to=file.path(to,dir[i],names[j]),
copy.date=TRUE)
}
utils::setTxtProgressBar(pb=pb,value=i/3)
}
}
# update(from="results",to="//tsclient/N/palasso/results")
}
Last data download started on 2019-03-26 (R version 3.5.3).
### Downloading "Isoform Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
# Downloading isoform expression data:
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="Isoform Expression Quantification")
TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
X[[i]][,c("miRNA_ID","reads_per_million_miRNA_mapped",
"cross-mapped","miRNA_region")] <- NULL
y[[i]] <- rep(project[i],times=length(unique(X[[i]]$barcode)))
}
save(list=c("y","X"),file=file.path(path,"data","isoform_raw.RData"))
load(file.path(path,"data","isoform_raw.RData"),verbose=TRUE)
# Merging isoform expression data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)
# Transform to matrix
Xs$isoform_coords <- gsub(pattern="hg38:chr",replacement="",x=Xs$isoform_coords)
samples <- unique(Xs$barcode)
covariates <- unique(Xs$isoform_coords)
row <- match(Xs$barcode,samples)
col <- match(Xs$isoform_coords,covariates)
X <- Matrix::sparseMatrix(i=row,j=col,x=Xs$read_count,dimnames=list(samples,covariates))
# Order by molecular location
split <- strsplit(x=colnames(X),split=":|-")
chr <- sapply(split,function(x) x[[1]])
pos <- sapply(split,function(x) x[[2]])
order <- order(chr,pos)
X <- X[,order]
if(FALSE){ # testing
i <- sample(seq_len(nrow(Xs)),size=1)
Xs$read_count[i]
X[Xs$barcode[i],Xs$isoform_coords[i]]
}
save(list=c("y","X"),file=file.path(path,"data","isoform_all.RData"))
### Downloading "miRNA Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Downloading data.
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="miRNA Expression Quantification")
TCGAbiolinks::GDCdownload(query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
data <- TCGAbiolinks::GDCprepare(query,directory=directory)
X[[i]] <- t(data[,c(seq(from=2,to=ncol(data),by=3))])
y[[i]] <- rep(project[i],times=nrow(X[[i]]))
}
save(list=c("y","X"),file=file.path(path,"data","miRNA_raw.RData"))
load(file.path(path,"data","miRNA_raw.RData"))
X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)
rownames(X) <- gsub(pattern="read_count_",replacement="",x=rownames(X))
save(list=c("y","X"),file=file.path(path,"data","miRNA_all.RData"))
### Downloading "Gene Expression Quantification". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
# Retrieving cancer types:
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
# Downloading data:
memory.limit(size=16000) # Activate virtual memory in system control!
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Transcriptome Profiling",
data.type="Gene Expression Quantification",
workflow.type="HTSeq - Counts"); gc()
TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory); gc()
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE)); gc()
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory); gc()
y[[i]] <- rep(project[i],times=ncol(X[[i]])); gc()
}
save(list=c("y","X"),file=file.path(path,"data","gene_raw.RData"))
load(file.path(path,"data","gene_raw.RData"))
genes <- SummarizedExperiment::rowData(X[[1]])
mart <- biomaRt::useMart("ensembl",dataset="hsapiens_gene_ensembl") #
char <- biomaRt::getBM(attributes=c("ensembl_gene_id","chromosome_name","transcript_start","gene_biotype"),filters=c("biotype","chromosome_name"),values=list("protein_coding",c(1:22,"X")),mart=mart)
select <- genes$ensembl_gene_id[genes$ensembl_gene_id %in% char$ensembl_gene_id]
X <- lapply(X,function(x) t(SummarizedExperiment::assays(x)$"HTSeq - Counts"[select,]))
X <- do.call(what=rbind,args=X)
y <- do.call(what="c",args=y)
save(list=c("y","X"),file=file.path(path,"data","gene_all.RData"))
### Downloading "Copy Number Variation". ###
#rm(list=ls())
#<<functions>>
directory <- file.path(path,"data")
setwd(directory)
project <- TCGAbiolinks::getGDCprojects()$id
project <- project[grepl(x=project,pattern="TCGA")]
y <- X <- list()
for(i in seq_along(project)){
query <- TCGAbiolinks::GDCquery(project=project[i],
data.category="Copy Number Variation",
data.type="Masked Copy Number Segment")
TCGAbiolinks::GDCdownload(query=query,method="api",directory=directory)
trace(TCGAbiolinks:::readTranscriptomeProfiling,tracer=quote(ignore.case<-TRUE))
X[[i]] <- TCGAbiolinks::GDCprepare(query,directory=directory)
y[[i]] <- rep(project[i],times=length(unique(X[[i]]$Sample))) # correct?
}
save(list=c("y","X"),file=file.path(path,"data","CNV_raw.RData"))
load(file.path(path,"data","CNV_raw.RData"),verbose=TRUE)
# Merging CNV data:
Xs <- do.call(what=rbind,args=X) # sparse matrix
y <- do.call(what="c",args=y)
#table(Xs$Sample)
# Prepare cutting.
cut <- list()
cut$chr <- c(1:22,"X")
cut$start <- sapply(cut$chr,function(x) min(Xs$Start[Xs$Chromosome==x]))
cut$end <- sapply(cut$chr,function(x) max(Xs$End[Xs$Chromosome==x]))
cut$length <- cut$end-cut$start
cut$dist <- sum(cut$length)/10000
cut$num <- round(cut$length/cut$dist)
# Create covariates.
cov <- list()
cov$p <- sum(cut$num)
cov$chromosome <- unlist(sapply(cut$chr,function(i) rep(i,times=cut$num[i])))
cov$location <- unlist(sapply(cut$chr,function(i) round(seq(from=cut$start[i],to=cut$end[i],length.out=cut$num[i]))))
cov$name <- paste0(cov$chromosome,":",cov$location)
# Create indices for each covariate.
index <- rep(list(integer()),times=cov$p)
pb <- utils::txtProgressBar(min=0,max=cov$p,style=3)
for(j in seq_len(cov$p)){
utils::setTxtProgressBar(pb=pb,value=j)
index[[j]] <- which((Xs$Chromosome==cov$chromosome[j]) &
(Xs$Start<=cov$location[j]) & (cov$location[j]<=Xs$End)) # consider <
}
# Expand indices to matrix.
X <- matrix(0,nrow=length(unique(Xs$Sample)),ncol=cov$p,
dimnames=list(unique(Xs$Sample),cov$name))
for(j in seq_along(index)){
mean <- Xs$Segment_Mean[index[[j]]]
i <- Xs$Sample[index[[j]]]
X[i,j] <- mean
}
if(FALSE){ # test
sample <- sample(rownames(X),size=1)
covariate <- sample(colnames(X),size=1)
split <- strsplit(covariate,split=":")[[1]]
a <- X[sample,covariate]
b <- Xs$Segment_Mean[(Xs$Sample==sample) & (Xs$Chromosome==split[1]) & (Xs$Start<=as.numeric(split[2])) & (as.numeric(split[2]) < Xs$End)]
all(a==b)
}
save(list=c("y","X","index"),file=file.path(path,"data","CNV_all.RData"))
### Extracting samples of interest. ###
#rm(list=ls())
#<<functions>>
type <- c("isoform","miRNA","CNV","gene")
for(i in seq_along(type)){
cat(type[i],"\n")
load(file.path(path,"data",paste0(type[i],"_all.RData")),verbose=TRUE)
# TCGA barcode
barcode <- rownames(X)
code <- sapply(barcode,function(x) strsplit(x,split="-"))
code <- as.data.frame(do.call(what=rbind,args=code))
colnames(code) <- c("project","TSS","participant","sample_vial",
"portion_analyte","plate","center")
code$sample <- substr(code$sample_vial,start=1,stop=2)
code$vial <- substr(code$sample_vial,start=3,stop=3)
code$portion <- substr(code$portion_analyte,start=1,stop=2)
code$analyte <- substr(code$portion_analyte,start=3,stop=3)
code$sample_vial <- code$portion_analyte <- NULL
# solid tumour (except blood for LAML)
solid <- (code$sample=="01" | (y=="TCGA-LAML" & code$sample=="03"))
X <- X[solid,]
y <- y[solid]
# unique samples
unique <- !duplicated(substr(rownames(X),start=1,stop=12))
X <- X[unique,]
y <- y[unique]
save(list=c("y","X"),file=file.path(path,"data",paste0(type[i],"_sub.RData")))
}
# isoform: n=9'794, p=194'595, k=32
# miRNA: n=9'794, p=1'881, k=32
# gene: n=9'830, p=19'602, k=33
# CNV: n=10'578, p=10'000, k=33
## Understanding barcodes:
# overview: https://wiki.nci.nih.gov/display/TCGA/TCGA+barcode
# details: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables
## Understanding replicate samples:
# http://gdac.broadinstitute.org/runs/sampleReports/latest/READ_Replicate_Samples.html
Last data analysis started on 2019-04-12 (R version 3.5.3).
### Analysing the TCGA data. ###
#rm(list=ls())
#<<functions>>
for(type in c("miRNA","isoform","CNV","gene")){
library(Matrix)
for(k in c(207,sample(528))){
set.seed(k); cat(k," ")
if(type %in% c("isoform","miRNA") & k > 496){next}
if(type %in% c("CNV","gene") & k > 528){next}
# searching for missing cancer-cancer combinations
rm(list=setdiff(ls(),c("k","type","path","save","file.exists","file.remove"))); gc()
file0 <- paste0("results/",type,"_start_",k,".RData")
file1 <- paste0("results/",type,"_loss_",k,".RData")
if(file.exists(file0)||file.exists(file1)){next}
save(object=k,file=file0)
load(paste0("data/",type,"_sub.RData"))
# indicating the cancer-cancer combination
cancer <- substring(text=unique(y),first=6)
comb <- utils::combn(x=cancer,m=2)
select <- paste0("TCGA-",comb[,k])
y <- ifelse(y==select[1],1,ifelse(y==select[2],0,NA))
rm(cancer,select)
# removing other cancer types
cond <- !is.na(y)
y <- y[cond]
X <- X[cond,]
rm(cond)
# pre-processing
if(type %in% c("isoform","miRNA")){
x <- palasso:::.prepare(X,cutoff="zero")
} else if(type=="gene"){
x <- palasso:::.prepare(X,cutoff="knee")
} else if(type=="CNV"){
x <- list(X=X,Z=sign(X))
x <- lapply(x,function(x) scale(x))
attributes(x)$info <- data.frame(n=nrow(X),p=ncol(X),prop=mean(x$Z==0))
}
rm(X)
# cross-validation
loss <- tryCatch(expr=palasso:::.predict(y=y,X=x,nfolds.int=10),error=function(e) palasso:::.predict(y=y,X=x,nfolds.int=10))
# information
loss$info <- cbind(k=k,
y0=comb[2,k],
y1=comb[1,k],
n0=sum(y==0),
n1=sum(y==1),
attributes(x)$info,
loss$info)
# refit
object <- palasso::palasso(y=y,X=x,nfolds=10,family="binomial",standard=TRUE,elastic=TRUE,shrink=TRUE)
model <- c(names(object),"elastic",
paste0("paired.",c("adaptive","standard","combined")))
for(max in c(10,50,Inf)){
temp <- list()
temp$nzero <- data.frame(model=model,x=NA,z=NA)
for(i in seq_along(model)){
coef <- palasso:::coef.palasso(object=object,model=model[i],max=max)
temp$nzero$x[i] <- sum(coef$x!=0)
temp$nzero$z[i] <- sum(coef$z!=0)
}
temp$select <- palasso:::subset.palasso(x=object,max=max,
model="paired.adaptive")$palasso$select
temp$weights <- palasso:::weights.palasso(object=object,max=max,
model="paired.adaptive")
temp$coef <- palasso:::coef.palasso(object=object,max=max,
model="paired.adaptive")
loss[[paste0("fit",max)]] <- temp
}
save(object=loss,file=file1)
file.remove(file0)
}
index <- sum(grepl(dir(),pattern="sessionInfo"))
sink(paste0("sessionInfo",index+1,".txt"))
date()
utils::sessionInfo()
devtools::session_info()
sink()
}
# The function <<collect>> loads all files from PATH including PATTERN in the file name, loads OBJECT into a list, and executes a function call.
#<<functions>>
# path: folder
# pattern: character, or NULL (all files)
# object: character vector, or NULL (all objects)
# what: function call
collect <- function(path=getwd(),pattern="",object=NULL,what="rbind"){
OBJECT = object
# identify files
files <- dir(path)
files <- files[grepl(x=files,pattern=pattern)]
files <- files[grepl(x=files,pattern=".RData")]
number <- gsub(pattern=paste0(pattern,"|.RData"),replacement="",x=files)
files <- files[order(as.numeric(number))] # trial1
names <- gsub(pattern=".RData",replacement="",x=files) # trial1
if(length(files)==0){stop("Invalid datasets.")}
# load data
all <- list()
for(i in seq_along(files)){
x <- load(file.path(path,files[i]))
x <- eval(parse(text=x))
if(is.null(OBJECT)){
all[[i]] <- x
names(all)[i] <- names[i]
} else {
for(j in seq_along(OBJECT)){
all[[OBJECT[j]]][[i]] <- x[[OBJECT[j]]]
names(all[[OBJECT[j]]])[i] <- names[i]
}
}
}
# fuse data
if(is.null(OBJECT)){
all <- do.call(what=what,args=all)
} else {
all <- lapply(all,function(x) do.call(what=what,args=x))
}
return(all)
}
LOSS <- list()
type <- c("gene","isoform","miRNA","CNV")
#type <- "miRNA"
for(i in seq_along(type)){
LOSS[[type[i]]] <- collect(path="results",
pattern=paste0(type[i],"_loss_"),
object=c("deviance","auc","class","info",
paste0("fit",c(10,50,Inf))))
}
for(i in seq_along(LOSS)){
for(j in 1:3){
colnames(LOSS[[i]][[j]])[colnames(LOSS[[i]][[j]])=="paired.adaptive"] <- "paired"
}
}
#type <- "gene"
#a <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","paired"]
#b <- LOSS[[type]]$deviance[rownames(LOSS[[type]]$deviance)=="10","elastic"]
#mean(a<b)
### Testing for significant differences. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
row <- c("gene","isoform","miRNA","CNV")
col <- c("10","Inf")
lay <- c("standard_x","standard_z","standard_xz",
"adaptive_x","adaptive_z","adaptive_xz",
"elastic") # added "elastic"
M <- array(NA,dim=c(length(row),length(col),length(lay)),dimnames=list(row,col,lay))
for(i in seq_along(row)){
loss <- LOSS[[row[i]]][c("info","deviance")]
y0 <- as.character(loss$info$y0)
y1 <- as.character(loss$info$y1)
cancer <- sort(unique(c(y0,y1)))
Z <- palasso:::.design(x=cancer)
for(j in seq_along(col)){
# differences
cond <- rownames(loss$deviance)==col[j]
for(k in seq_along(lay)){
fill <- loss$deviance[cond,lay[k]] - loss$deviance[cond,"paired"]
X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),
dimnames=list(cancer,cancer))
X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- fill
X[lower.tri(X)] <- NA
# p-values
pvalue <- rep(NA,times=max(Z))
for(l in seq_len(max(Z))){
x <- as.numeric(X[Z==l])
if(col[j]=="10"){
alternative <- "greater" # Never use "two.sided"!
}
if(col[j]=="Inf"){
alternative <- "less" # Never use "two.sided"!
}
pvalue[l] <- stats::wilcox.test(x=x,alternative=alternative,
exact=FALSE)$p.value
}
# Simes
M[i,j,k] <- palasso:::.combine(pvalue,method="simes")
}
}
}
# Table SIG: significance
constraint <- "10"
table <- format(M[,constraint,1:6],digits=1,scientific=FALSE)
for(i in seq_len(nrow(table))){
for(j in seq_len(ncol(table))){
if(M[i,constraint,j]>=0.05){
table[i,j] <- paste0("{\\color{gray}{",table[i,j],"}}")
}
}
}
one <- c("","\\text{standard}","","","\\text{adaptive}","")
two <- paste0("\\text{",c("x","z","xz","x","z","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### Comparison with elastic net. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
row <- c("gene","isoform","miRNA","CNV")
col <- c("10","50","Inf")
better <- worse <- less <- matrix(NA,nrow=length(row),ncol=length(col),
dimnames=list(row,col))
for(i in seq_along(row)){
for(j in seq_along(col)){
# proportion of improvements (cross-validation)
cond <- rownames(LOSS[[row[i]]]$deviance)==col[j]
loss <- LOSS[[row[i]]]$deviance[cond,c("paired","elastic")]
better[i,j] <- round(mean(loss[,"paired"]<loss[,"elastic"]),digits=2)
worse[i,j] <- round(mean(loss[,"paired"]>loss[,"elastic"]),digits=2)
# average difference in nzero (refitted models)
df_paired <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="paired.adaptive",c("x","z")]))
df_elastic <- apply(LOSS[[row[i]]][[paste0("fit",col[j])]],1,function(x) sum(x$nzero[x$nzero[,"model"]=="elastic95",c("x","z")]))
df_diff <- df_elastic-df_paired
less[i,j] <- round(mean(df_diff),digits=2)
#graphics::hist(df_diff,main=paste(row[i],col[j]),xlim=c(-1,1)*max(abs(df_diff)))
}
}
better
worse
less
### Analysing the refitted models. ###
#rm(list=ls())
#<<functions>>
#<<collect>>
# Table SEL: selected model
nzero <- paste0("fit",c(5,10,Inf))
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),
"between_xz","within_xz")
type <- c("gene","isoform","miRNA","CNV")
table <- array(NA,dim=c(length(nzero),length(model),length(type)),
dimnames=list(nzero,model,type))
for(i in seq_along(nzero)){
for(j in seq_along(model)){
for(k in seq_along(type)){
sub <- LOSS[[type[k]]][[nzero[i]]]
table[i,j,k] <- sum(sub[,"select"]==model[j])
}
}
}
colSums(table["fit10",,]) # CHECK WHETHER COMPLETE!
table <- round(prop.table(table["fit10",,],margin=2),digits=2)
table <- t(table)
table <- table[,apply(table,2,function(x) any(x!=0))]
rownames(table) <- paste0("\\text{",rownames(table),"}")
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
# selected weights and covariates
type <- c("gene","isoform","miRNA","CNV")
group <- c("x","z")
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),
"paired.adaptive","elastic") # added "elastic" , paste0("elastic",c(100,75,50,25))
weights10 <- weightsInf <- matrix(NA,nrow=length(group),ncol=length(type),
dimnames=list(group,type))
coef10 <- coefInf <- array(NA,dim=c(length(group),length(type),length(model)),
dimnames=list(group,type,model))
for(i in seq_along(group)){
for(j in seq_along(type)){
weights10[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fit10[,"weights"],colMeans))
weightsInf[,j] <- rowMeans(sapply(LOSS[[type[j]]]$fitInf[,"weights"],colMeans))
for(k in seq_along(model)){
coef10[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fit10[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
coefInf[i,j,k] <- mean(sapply(LOSS[[type[j]]]$fitInf[,"nzero"], function(x) sum(x[x$model==model[k],group[i]])))
}
}
}
# coef10["x",,]+coef10["z",,]
# with sparsity constraint
round(prop.table(weights10,margin=2),2)
round(prop.table(coef10[,,"paired.adaptive"],margin=2),2)
round(colSums(coef10[,,"paired.adaptive"]),2)
# natural sparsity
round(prop.table(weightsInf,margin=2),2)
round(prop.table(coefInf[,,"paired.adaptive"],margin=2),2)
round(colSums(coefInf[,,"paired.adaptive"]),2)
round(colSums(coefInf[,,"elastic"])/colSums(coefInf[,,"paired.adaptive"]),1) # multiple nzero of elastic net and paired lasso
# Table NSC: number of non-zero coefficients
table <- round(coefInf["x",,]+coefInf["z",,])
colnames(table)[7] <- "paired"
one <- c("","\\text{standard}","","","\\text{adaptive}","","\\text{paired}","\\text{elastic}")
two <- paste0("\\text{",c("x","z","xz","x","z","xz","xz","xz"),"}")
rownames(table) <- paste0("\\text{",rownames(table),"}")
table <- rbind(one,two,table,deparse.level=0)
rownames(table)[1] <- "~"
xtable <- xtable::xtable(table,align=c("r","|","c","c","c","|","c","c","c","|","c","|","c"))
xtable::print.xtable(xtable,type="latex",include.colnames=FALSE,sanitize.text.function=identity)
### FIGURE CSW ###
#rm(list=ls())
#<<functions>>
set.seed(1)
overfit <- TRUE
# simulate
n <- 10
cx <- stats::rbeta(n=n,shape1=0.9,shape2=1)
cz <- stats::rbeta(n=n,shape1=0.4,shape2=0.9)
# collection
x <- list()
y <- list()
# adaptive weights (X only)
x[[1]] <- rep(1,times=n)
if(overfit){x[[1]] <- cx}
y[[1]] <- rep(0,times=n)
# adaptive weights (Z only)
x[[2]] <- rep(0,times=n)
y[[2]] <- rep(1,times=n)
if(overfit){y[[2]] <- cz}
# adaptive weights (X and Z)
x[[3]] <- y[[3]] <- rep(0.5,times=n)
if(overfit){x[[3]] <- cx}
if(overfit){y[[3]] <- cz}
# within-pair weights
x[[4]] <- cx^2/(cx+cz)
y[[4]] <- cz^2/(cx+cz)
# visualisation
graphics::par(mfrow=c(1,4),mar=c(4.5,0.5,0.5,0.5),oma=c(0,2,0,0))
for(i in seq_len(4)){
palasso:::plot_pairs(x=x[[i]],y=y[[i]],lwd=4)
if(i==1){
graphics::mtext(text="covariate pair",side=2,line=1)
}
}
### FIGURE DIA ###
#rm(list=ls())
#<<functions>>
ellipse <- function(x,y,a=0.2,b=0.25,border=NA){
n <- max(c(length(x),length(y)))
if(length(x)==1){x <- rep(x,times=n)}
if(length(y)==1){y <- rep(y,times=n)}
if(length(border)==1){border <- rep(border,times=n)}
for(i in seq_len(n)){
angle <- seq(from=0,to=2*pi,length=100)
xs <- x[i] + a * cos(angle)
ys <- y[i] + b * sin(angle)
graphics::polygon(x=xs,y=ys,col=grey(0.9),border=border[i])
}
}
cancer <- c("ACC","BLCA","BRCA","UVM")
number <- c(80,409,1078,80)
col <- grDevices::rgb(red=0,green=0,blue=sample(seq(from=75,to=255,length.out=length(number))),maxColorValue=255)
lwd <- log(2*number)-2
lwd = pmax(5*number/max(number),1)
x1 <- 1.3; x2 <- 2; x3 <- 3
set.seed(1)
# first layer (bubble)
graphics::plot.new()
graphics::par(mar=c(0,0,0,0))
graphics::plot.window(xlim=c(1.1,3.3),ylim=c(-1.6,1.6))
ellipse(x=x1,y=0)
graphics::text(x=x1,y=0,labels="TCGA",font=2,adj=c(0.5,0))
graphics::text(x=x1,y=0,labels="n=9794",adj=c(0.5,1.2),cex=0.9)
# second layer (colon)
y1 <- seq(from=1,to=-1,length.out=length(cancer)+1)
graphics::text(x=x2,y=y1[length(cancer)],labels="...",font=2,srt=90)
y1 <- y1[-length(cancer)]
# first-second layer (connect)
graphics::segments(x0=x1+0.2,y0=0,x1=x2-0.2,y1=y1,lwd=lwd,col=col)
# second layer (bubble)
ellipse(x=x2,y=y1,a=0.2,b=0.2)
graphics::text(x=x2,y=y1,labels=cancer,font=2,col=col,adj=c(0.5,0))
graphics::text(x=x2,y=y1,labels=paste0("n=",number,""),adj=c(0.5,1.2),cex=0.9)
comb <- utils::combn(x=seq_along(y1),m=2)
# third layer (colon)
y2 <- seq(from=1.5,to=-1.5,length.out=ncol(comb)+1)
graphics::text(x=x3,y=y2[ncol(comb)],labels="...",font=2,srt=90)
y2 <- y2[-ncol(comb)]
# second-third layer (connect)
graphics::segments(x0=2.2,y0=y1[comb[1,]],x1=2.7,y1=y2,lwd=lwd[comb[1,]],col=col[comb[1,]])
graphics::segments(x0=2.2,y0=y1[comb[2,]],x1=2.7,y1=y2,lwd=lwd[comb[2,]],col=col[comb[2,]])
# third layer (bubble)
ellipse(x=x3,y=y2,a=0.3,b=0.22)
graphics::text(x=x3,y=y2,labels=paste0(cancer[comb[1,]]," "),
font=2,col=col[comb[1,]],adj=c(1,0))
graphics::text(x=x3,y=y2,labels=paste0(" ",cancer[comb[2,]]),
font=2,col=col[comb[2,]],adj=c(0,0))
graphics::text(x=x3,y=y2,labels=":",font=2,adj=c(0.5,0))
labels <- apply(comb,2,function(x) sum(number[x]))
labels <- paste0("n=",labels,"")
graphics::text(x=x3,y=y2,labels=labels,adj=c(0.5,1.2),cex=0.9)
### FIGURE CLA ###
#rm(list=ls())
#<<functions>>
graphics::par(mfrow=c(4,2),oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))
for(type in c("gene","isoform","miRNA","CNV")){
loss <- LOSS[[type]][c("deviance","auc","class")]
choice <- "paired"
loss <- lapply(loss,function(x) x[,c(paste0("standard_",c("x","z","xz")),paste0("adaptive_",c("x","z","xz")),choice)])
for(constraint in c("10")){ # c("5","10","Inf")
# change
sub <- lapply(loss,function(x) x[rownames(x)==constraint,])
palasso:::plot_score(sub$deviance,choice=choice)
change <- sub$deviance[,7]-sub$deviance[,-7]
palasso:::plot_box(change,ylab="change",zero=TRUE,choice=NA)
# info
info <- list()
info$select <- names(which.min(apply(sub$deviance,2,median)[-7]))
info$DEV_paired <- median(sub$deviance[,choice])
info$DEV_select <- median(sub$deviance[,info$select])
info$improve <- mean(sub$deviance[,info$select]>sub$deviance[,choice])
info$AUC_paired <- median(sub$auc[,choice])
info$CLASS_paired <- median(sub$class[,choice])
print(as.data.frame(info)) # important
}
}
### FIGURE DEC ###
#rm(list=ls())
#<<functions>>
#graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(1,1))
graphics::par(oma=c(1.0,1.0,0,0),mar=c(1.5,3.0,0.5,0.5),mfrow=c(2,2))
for(type in c("gene","isoform","miRNA","CNV")){
models <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")),"paired")
constraint <- c("3","4","5","10","15","20","25","50","Inf")
loss <- LOSS[[type]]["deviance"]
loss <- lapply(loss,function(x) x[,models])
table <- matrix(NA,nrow=length(constraint),ncol=length(models),
dimnames=list(constraint,models))
for(i in seq_along(constraint)){
sub <- lapply(loss,function(x) x[rownames(x)==constraint[i],])
table[i,] <- apply(sub$deviance,2,median)
}
# table <- log(table)
graphics::plot.new()
graphics::plot.window(xlim=c(1,length(constraint)),ylim=range(table))
graphics::box()
constraint[constraint=="Inf"] <- "n"
graphics::axis(side=2)
graphics::axis(side=1,at=seq_along(constraint),labels=constraint,tick=FALSE,line=-1)
for(k in c(1,2)){
for(i in seq_along(models)){
lty <- ifelse(i%in%c(1,2,3),3,ifelse(i%in%c(4,5,6),2,1))
col <- ifelse(i==7,"#00007F","#FF3535")
pch <- ifelse(i%in%c(1,4),"x",ifelse(i%in%c(2,5),"z",1))
if(k==1){
graphics::lines(table[,i],col=col,lty=lty,lwd=2)
graphics::points(table[,i],col="white",pch=16,cex=1.2)
} else {
graphics::points(table[,i],col=col,pch=1,font=2)
}
}
}
}
graphics::title(ylab="deviance",line=0.0,outer=TRUE)
graphics::title(xlab="sparsity constraint",ylab="deviance",line=0.0,outer=TRUE)
### FIGURE CNV ###
#rm(list=ls())
#<<functions>>
graphics::par(oma=c(0,0,0,0),mar=c(2.1,3.5,0.5,0.5))
graphics::layout(matrix(c(1,1,2,2),nrow=1))
loss <- LOSS[[type]][c("deviance","auc","class")]
loss <- lapply(loss,function(x) x[rownames(x)=="10",])
model <- c(paste0("standard_",c("x","z","xz")),
paste0("adaptive_",c("x","z","xz")))
diff <- loss$auc[,"paired"]-loss$auc[,model]
palasso:::plot_box(diff,zero=TRUE,invert=TRUE,ylab="change")
diff <- loss$class[,"paired"]-loss$class[,model]
palasso:::plot_box(diff,zero=TRUE,ylab="change")
### FIGURE MAP ###
#rm(list=ls())
#<<functions>>
loss <- LOSS[["CNV"]][c("info","auc")]
cancer <- sort(unique(c(levels(loss$info$y0),levels(loss$info$y1))))
X <- matrix(NA,nrow=length(cancer),ncol=length(cancer),dimnames=list(cancer,cancer))
#Z <- palasso:::.design(x=cancer)
y0 <- as.character(loss$info$y0)
y1 <- as.character(loss$info$y1)
X[cbind(y0,y1)] <- X[cbind(y1,y0)] <- loss$auc[rownames(loss$auc)=="10","paired"]
graphics::par(mar=c(0.5,3.0,3.0,0.5))
dimnames(X) <- lapply(dimnames(X),function(x) paste0(" ",x," "))
palasso:::plot_table(X=X,margin=-1,labels=FALSE,las=2,cex=0.7)
#sort(rowMeans(X,na.rm=TRUE),decreasing=TRUE)[1:2] # keep!
### FIGURE COM ###
# 32 cancer types for isoform and miRNA
# 33 cancer types for gene and CNV
#rm(list=ls())
#<<functions>>
for(type in c("miRNA")){
cancer <- sort(unique(as.character(unlist(LOSS[[type]]$info[,c("y0","y1")]))))
n <- length(cancer)
z <- as.numeric(palasso:::.design(x=n))
x <- rep(seq_len(n),each=n)
y <- rep(seq(from=n,to=1,by=-1),times=n)
pch <- z
pch[pch==0] <- NA
pex <- c(".","O","*","+","o","-","'","x")
# colour
base <- grDevices::colorRampPalette(colors=c('darkblue','blue','red','darkred'))(n)
col <- rep(NA,times=length(z))
col[z==0] <- "white"
for(i in seq_len(n)){
col[z==i] <- base[i]
}
graphics::par(mfrow=c(1,1),mar=c(0,0,2,2))
graphics::plot.new()
graphics::plot.window(xlim=c(1,n),ylim=c(1,n))
graphics::points(x=x[pch<=25],y=y[pch<=25],
pch=pch[pch<=25],col=col[pch<=25],cex=0.9)
graphics::points(x=x[pch>25],y=y[pch>25],
pch=pex[(pch-25)[pch>25]],col=col[pch>25],cex=0.9)
graphics::segments(x0=0,x1=n+1,y0=n+1)
graphics::segments(x0=n+1,y0=n+1,y1=0)
graphics::segments(x0=0,x1=n+1,y0=n+1,y1=0,lty=2)
graphics::mtext(text=cancer,side=3,at=1:n,las=2,cex=0.7)
graphics::mtext(text=cancer,side=4,at=n:1,las=2,cex=0.7)
}
These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.