In this writeup, I work through some R coding exercises and statistical concepts relating to bioinformatics. These notes/exercises are mainly from PH525.3x, one of eight bioinformatics courses, with a completion deadline of Sept 15.

Inference: random variables, false positives, multiple hypothesis testing, FDR, q-values

#Mouse experiment testing 20 diets: first simulate the null by drawing both groups from the same population
#'population' is assumed to be the female mouse control weights from the course data (femaleControlsPopulation.csv)
set.seed(1)
pvals <- replicate(1000,{
  control = sample(population[,1],12)
  treatment = sample(population[,1],12)
  t.test(treatment,control)$p.val
})
mean(pvals<0.05) # also try mean(pvals<0.01)
## [1] 0.041
# control and treatment are both sampled from the same population, so all null hypotheses are true and the p-values are uniform
hist(pvals)
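As an extra sanity check (a sketch, not part of the original exercise), the uniformity of these null p-values can be verified with a QQ-plot against Uniform(0,1) quantiles:

#points should fall on the identity line if the p-values are uniform
qqplot(qunif(ppoints(length(pvals))), pvals, xlab="Uniform(0,1) quantiles", ylab="observed p-values")
abline(0, 1, col=2)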

#diets
#Now run a Monte Carlo simulation imitating the experiment for all 20 diets. If you set the seed at 100, set.seed(100), and use the same code as above inside a call to replicate, how many of the p-values are below 0.05? Answer: 1
set.seed(100)
diets <- replicate(20, {
  cases = rnorm(10,30,2)
  controls = rnorm(10,30,2)
  t.test(cases,controls)$p.val < 0.05
})

mean(diets)
## [1] 0.05
#Number of p-values (out of 20) that are false positives
mean(diets)*20
## [1] 1
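This matches the analytic expectation: each of the 20 tests is a false positive with probability 0.05, so by linearity of expectation the expected count is m*alpha.

#expected number of false positives = m * alpha
20 * 0.05  # = 1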
#What is the average of these 1,000 numbers? Note that this is the expected number of tests (out of the 20 we run) that we will reject when the null is true. (Hint: use replicate twice) Answer: 1.007
set.seed(100)
# re-run the 20-diet simulation inside each of the 1,000 replications
mcavg = replicate(1000, {
  diets <- replicate(20, {
    cases = rnorm(10,30,2)
    controls = rnorm(10,30,2)
    t.test(cases,controls)$p.val < 0.05
  })
  sum(diets)
})
mean(mcavg)
## [1] 1.007
#For what proportion of the 1,000 replications do we reject the null hypothesis at least once (more than 0 false positives)? (Enter your response as a decimal value -- i.e. 0.10 for 10%.) .632
mean(replicate(1000,  { mean( replicate(20, {cases = rnorm(10,30,2)
controls = rnorm(10,30,2) 
#although the avg/sample(20) is 1, over 1000 samples, over half will contain at least one FP
t.test(cases,controls)$p.val} < 0.05) )} >0) )
## [1] 0.646
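For comparison, the analytic answer under independence:

#P(at least one FP in 20 independent tests) = 1 - (1 - 0.05)^20
1 - 0.95^20  # ~0.642, close to the simulation estimate above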
#Q1.3.2: we previously learned that under the null, the probability of a p-value < p is p. If we run 8,793 independent tests, what is the probability of incorrectly rejecting at least one of the null hypotheses?
#Monte Carlo version of the same question:
#B <- 1000
#minpval <- replicate(B, min(runif(8793,0,1)) < 0.05)
#mean(minpval)
alpha=0.05
1-(1-alpha)^8793
## [1] 1
#Sidak cutoff: lower the per-test cutoff (previously 0.05) so that the probability of at least one mistake is 5%
1 - (1-alpha)^(1/8793)
## [1] 5.833407e-06
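A quick check that this cutoff does what it promises: substituting the Sidak cutoff back into the FWER formula recovers exactly 0.05.

sidak_cutoff = 1 - (1-alpha)^(1/8793)
1 - (1 - sidak_cutoff)^8793  # = 0.05 under independence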
#Q1.4.2 8,793 t-tests; set the cutoff using the Bonferroni correction and report back the FWER. Set the seed at 1, set.seed(1), and run 10,000 simulations.

set.seed(1)
# proportion of the n uniform p-values falling below the Bonferroni cutoff 0.05/n
bonferroni <- function(n) {
  mean(runif(n,0,1) < (0.05/n))
}
r = replicate(10000, bonferroni(8793))
# FWER: proportion of the 10,000 simulations with at least one false positive
mean(r>0)
## [1] 0.0464
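For comparison, the analytic FWER of the Bonferroni cutoff under independence:

1 - (1 - 0.05/8793)^8793  # ~0.0488, consistent with the 0.0464 simulation estimate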
#Bonferroni is slightly more conservative (less power) than Sidak
sidak <- function(n) {
  cutoff = 1 - (1-0.05)^(1/n)
  mean(runif(n,0,1) < cutoff)
}
rs = replicate(10000,sidak(8793))
mean(rs>0)
## [1] 0.0517
#load the Bioconductor packages used below (startup messages omitted)
library(genefilter)  # provides rowttests
library(qvalue)      # provides qvalue
#FDR, q-values, and a Monte Carlo study of false-positive/false-negative rates
#Q1.5.1 Compute a p-value for each gene using the function rowttests from the genefilter package in Bioconductor. How many are less than 0.05?
#'sampleInfo' and 'geneExpression' are from the GSE5859Subset course dataset
g <- factor(sampleInfo$group)
results <- rowttests(geneExpression,g)
pvals <- results$p.value
mean( pvals <0.05)
## [1] 0.1572842
table(pvals<0.05)
## 
## FALSE  TRUE 
##  7410  1383
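As a spot check (a sketch, not part of the exercise): rowttests performs equal-variance t-tests row by row, so any single row should agree with t.test using var.equal=TRUE.

#compare rowttests to the standard t.test on the first gene
t.test(geneExpression[1, g==levels(g)[1]], geneExpression[1, g==levels(g)[2]], var.equal=TRUE)$p.value
pvals[1]  # should be identical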
#Bonferroni is more conservative and yields a list expected to contain no false positives:
#Q1.5.2 With the Bonferroni correction, how many genes are significant?
bnfcutoff = 0.05/(length(pvals))
table(pvals<bnfcutoff)
## 
## FALSE  TRUE 
##  8783    10
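Equivalently, p.adjust with method="bonferroni" multiplies each p-value by the number of tests, so thresholding the adjusted values at 0.05 selects the same genes:

sum(p.adjust(pvals, method="bonferroni") < 0.05)  # same 10 genes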
#Q1.5.3 FDR-adjusted p-values via p.adjust()
#a feature's q-value is the estimated FDR incurred by calling significant every feature ranked at or above it
?p.adjust
table(p.adjust(pvals, method="fdr")<0.05)
## 
## FALSE  TRUE 
##  8780    13
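For intuition, the Benjamini-Hochberg procedure behind method="fdr" can be sketched by hand: reject the k smallest p-values, where k is the largest index at which the sorted p-values fall below the line 0.05*k/m.

m <- length(pvals)
sortedp <- sort(pvals)
max(which(sortedp <= 0.05 * (1:m) / m))  # number of rejections; should match the 13 above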
#FDR is less conservative than Bonferroni, but gives a list in which about 5% are expected to be false positives.
#Q1.5.4 Number of genes with q-value < 0.05 (qvalue package)
?qvalue
qvals=qvalue(pvals)$qvalues
qvalsfdr=qvalue(pvals, fdr.level=0.05)$significant
table(qvalsfdr)
## qvalsfdr
## FALSE  TRUE 
##  8771    22
#pi0 = estimated proportion of tests for which the null hypothesis is true
pinull<-qvalue(pvals)$pi0
pinull
## [1] 0.6695739
#pi0 re-estimated with the bootstrap method
qobj <- qvalue(pvals, fdr.level=0.05, pi0.method="bootstrap", adj=1.2)$pi0
qobj
## [1] 0.7038066
#pi0 estimates the fraction of p-values that come from the null (uniform) distribution
hist(pvals,breaks=seq(0,1,len=21))
expectedfreq <- length(pvals)/20 #per bin
abline(h=expectedfreq*qvalue(pvals)$pi0,col=2,lty=2)

#qvalue() is less conservative because it multiplies by pi0, its estimate of the proportion of true nulls
#so the ratio of qvalue() q-values to BH-adjusted p-values is constant and equal to pi0
plot(qvalue(pvals)$qvalue/p.adjust(pvals,method="fdr") ,ylab="qval:pval",main="Null Hypothesis Estimate",type="o" , pch=10, cex=.2)
abline(h=qvalue(pvals)$pi0,col=2)
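A numeric check of the relationship plotted above (a sketch): the ratio of q-values to BH-adjusted p-values is essentially constant at pi0.

summary(qvals / p.adjust(pvals, method="fdr"))  # all values ~pi0 = 0.67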

#Q1.5.7 FP/FN rates of Bonferroni, FDR, and q-value cutoffs with m=8793 tests, m0=8293 true nulls, and m1=500 true positives
#8,793 genes for 24 samples: 12 cases and 12 controls.
n <- 24
m <- 8793
positives <- 500
delta <- 2
g <- factor(c(rep(1,12), rep(0,12)))
set.seed(1)
rb <- replicate(1000, {
  # regenerate the data on every replication; the first 500 genes get a true effect of size delta in the cases
  mat <- matrix(rnorm(n*m), m, n)
  mat[1:positives, 1:(n/2)] <- mat[1:positives, 1:(n/2)] + delta
  p <- rowttests(mat, g)$p.value
  bnf <- p.adjust(p, method="bonferroni")
  fdr <- p.adjust(p, method="fdr")
  qq  <- qvalue(p)$qvalues
  nulls <- (positives+1):m
  c(bnf_fp = mean(bnf[nulls] < 0.05), bnf_fn = mean(bnf[1:positives] > 0.05),
    fdr_fp = mean(fdr[nulls] < 0.05), fdr_fn = mean(fdr[1:positives] > 0.05),
    q_fp   = mean(qq[nulls]  < 0.05), q_fn   = mean(qq[1:positives] > 0.05))
})
#average FP and FN rates over the 1,000 simulations: Bonferroni has the fewest false positives but the most false negatives
rowMeans(rb)

ttestgenerator <- function(n) {
  # note that here we have a false "smokers" group where we actually
  # sample from the nonsmokers: this is because we are modeling the *null*
  # 'dat' is the babies dataset from the course (babies.txt); bwt = birth weight, smoke = smoking indicator
  smokers = sample(dat$bwt[dat$smoke==0], n)
  nonsmokers = sample(dat$bwt[dat$smoke==0], n)
  return((mean(smokers)-mean(nonsmokers))/sqrt(var(smokers)/n + var(nonsmokers)/n))
}
#ttests <- replicate(1000, ttestgenerator(10))
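To see what these null t-statistics look like (a sketch, assuming the babies dataset is loaded as 'dat'): with only 10 observations per group, the t-distribution with 2n-2 degrees of freedom fits the tails better than the normal.

#ttests <- replicate(1000, ttestgenerator(10))
#ps <- (seq_along(ttests) - 0.5) / length(ttests)
#qqplot(qt(ps, df=2*10-2), ttests); abline(0, 1)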

Exploratory Data Analysis: QQ-plots, MA-plot

#load the two-array spike-in dataset (SpikeInSubset also loads affy)
library(SpikeInSubset)
data(mas133)
e <- exprs(mas133)
plot(e[,1],e[,2],main=paste0("corr=",signif(cor(e[,1],e[,2]),3)),cex=0.5)
k <- 3000
b <- 1000 #a buffer
polygon(c(-b,k,k,-b),c(-b,-b,k,k),col="red",density=0,border="red")

#proportion of points in rectangle
xx=(e[,1]>-b) & (e[,1]<k)
yy=(e[,2]>-b) & (e[,2]<k)
xy = xx & yy
mean(xy)
## [1] 0.9475336
#Q1.6.3
#the log transform keeps the few very large values from dominating the plot; without it, about 95% of the data is squeezed into a tiny corner of the plotting region
e <- log2(exprs(mas133))
plot((e[,1]+e[,2])/2,e[,2]-e[,1],cex=0.5,main='MA-plot',xlab='log-average',ylab='log-difference')
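For reference, the two MA-plot axes are conventionally called A (average log intensity) and M (log ratio, the "minus" on the y-axis):

A <- (e[,1]+e[,2])/2  # average log2 intensity
M <- e[,2]-e[,1]      # log2 fold-change between the two replicates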

#What is the SD of the log-ratios, i.e. the spread along the MA-plot y-axis?
#High correlations are not evidence of replication. Replication implies small differences between the observations, which is better measured with distances or differences.
#A fold-change of 2 corresponds to an absolute log2-difference of 1
difflog = e[,2]-e[,1]
#sd(difflog)
#How many genes differ by more than a factor of 2?
sd1 = abs(difflog)>1
sum(sd1)
## [1] 3057
length(sd1)
## [1] 22300
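From the two counts above, the proportion of genes whose measurements differ by more than a factor of 2 between these replicates:

mean(sd1)  # 3057/22300, roughly 0.137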
load("skew.RData")
par(mfrow=c(3,3))
for (i in 1:9) { qqnorm( dat[,i] ) }
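Adding a reference line to each panel makes departures from normality easier to judge (a sketch on the same data):

par(mfrow=c(3,3))
for (i in 1:9) { qqnorm(dat[,i]); qqline(dat[,i]) }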