packages

Phenotypic variation

data(lgrc.expr.meta)
class(expr.meta)
## [1] "data.frame"
colnames(expr.meta)
## [1] "tissueid"    "sample_name" "newid"       "GENDER"      "age"        
## [6] "cigever"     "pkyrs"       "diagmaj"     "gender"
table(expr.meta$GENDER)
## 
##   1-Male 2-Female 
##      119      110
#Q2 median of distribution packs smoked (male+female)?
median(expr.meta$pkyrs)
## [1] 40
# is pkyrs normal distribution?
dev.off()
## null device 
##           1
mypar(2,1)
hist(expr.meta$pkyrs)
qqnorm(expr.meta$pkyrs)

dna variation (snp, )

class(BSgenome.Hsapiens.UCSC.hg19)
## [1] "BSgenome"
## attr(,"package")
## [1] "BSgenome"
#chromosome11
chr11seq <- BSgenome.Hsapiens.UCSC.hg19[["chr11"]]
subseq(chr11seq,start=10^6,width=25)
##   25-letter "DNAString" instance
## seq: GTTTCACCGTGTTAGCCAGGGTGGT
#Q: Read the help file for the fuction countPattern and tell us which of the following sequences is most common on chromosome 11: "ATG", "TGA", "TAA", and "TAG"
?countPattern
#countPattern("ATG", chr11seq)
countPatternList<- function(d){
  return( countPattern(d,chr11seq))
}
sort(sapply( c("ATG","TGA","TAA","TAG"), countPatternList  ))
##     TAG     ATG     TGA     TAA 
## 1689356 2389002 2561021 2624324
#Q: Now we move to a question about chromosome 7. Read the help page for the function alphabetFrequency and use it to determine what percent of chromosome 7 is T,C,G, and A. Note that we have other letters. For example N, which represents positions that are not called, appears often. What proportion are Cs (including counts of N in the total)
chr7seq <- BSgenome.Hsapiens.UCSC.hg19[["chr7"]]
af<-alphabetFrequency(chr7seq, as.prob=FALSE )
af['C'] / sum(af)
##         C 
## 0.1990193
#snps on chrmosome17
s17 = getSNPlocs("ch17")
head(s17)
##   RefSNP_id alleles_as_ambig loc
## 1 145615430                Y  56
## 2 148170422                S  78
## 3 183779916                R  80
## 4 188505217                R 174
## 5   9747578                R 293
## 6 200398583                W 300
#Q: What is the location on chr17 of SNP rs73971683?
s17[which( s17$RefSNP_id==73971683 ) ,]
##      RefSNP_id alleles_as_ambig    loc
## 3228  73971683                R 135246
##(with dplyr) s17 %>% filter(RefSNP_id=="73971683") %>% select(loc)

gene expression

data(tissuesGeneExpression)
head(e[,1:5])
##           GSM11805.CEL.gz GSM11814.CEL.gz GSM11823.CEL.gz GSM11830.CEL.gz
## 1007_s_at       10.191267       10.509167       10.272027       10.252952
## 1053_at          6.040463        6.696075        6.144663        6.575153
## 117_at           7.447409        7.775354        7.696235        8.478135
## 121_at          12.025042       12.007817       11.633279       11.075286
## 1255_g_at        5.269269        5.180389        5.301714        5.372235
## 1294_at          8.535176        8.587241        8.277414        8.603650
##           GSM12067.CEL.gz
## 1007_s_at       10.157605
## 1053_at          6.606701
## 117_at           8.116336
## 121_at          10.832528
## 1255_g_at        5.334905
## 1294_at          8.303227
#Look at the data for the feature with ID "209169_at". You can index the rows of 'e' directly with this character string.
#Q: Which of the following best describes the data? (Hint: stratify data by tissue and create boxplots)
boxplot(e["209169_at",] ~ tissue, las=2, main="209169_at brain differentially expressed" ) #vertical ylabel

#Which of the following ID(s) appear to represent a gene specific to placenta? 
IDs = c("201884_at", "209169_at", "206269_at", "207437_at", "219832_s_at", "212827_at")
plc <- function(d){
  ep<-split(e[d,] , tissue)$placenta   
  return(ep)
  }
placentas <- sapply(IDs,plc)
mypar(4,2)
for(i in IDs){
 boxplot(e[i,]~tissue,las=2, main=i)
}
boxplot(placentas, main="placenta")

# Identifiers are provided by the manufacturer of the machine that makes the measurements. 
# gene "206269_at" what is function? Where is it on genome? What is sequence? 
# Bioconductor is that it connect R, an existing comprehensive toolbox for data analysis to databases annotating the genome.
#for microarry  Affymetirx Human GeneChip HG133A. Search the Bioconductor website and determine which of the following packages provides a connection to gene information: hgu133a.db
# 3 tables: 1.expression(rows=genes, cols=tissue), 2.phenotype(rows=tissue), 3. describe gens(rows=genes)

#library(GSE5859)
#data(GSE5859)
#class(e)
#dat = exprs(e)
#dim(dat)
#sampleInfo = pData(e)
#dim(sampleInfo)
#head(sampleInfo)

#library(hgfocus.db)
#annot = select(hgfocus.db, 
#               keys=featureNames(e), 
 #              keytype="PROBEID", 
  #             columns=c("CHR", "CHRLOC", "SYMBOL"))
## here we pick one column from the annotation
#annot = annot[ match(featureNames(e),annot$PROBEID), ]
#head(annot)
#dim(annot)

molecular biology

Annotate

# # using a database object
# # OrganismDb object Homo.sapiens contains annotation information about human genes, in particular a series of relations between  identifiers about human genes. Some of the identifiers can be provided as "keys", that is, as values to look up in the database, in order to find pieces of information which match that key. We can list all the possible types of keys:
# keytypes(Homo.sapiens)
# #There are also columns in the database, not all of which are keys.
# columns(Homo.sapiens)
# #Two widely used catalogs of genes are NCBI Entrez Gene, and EBI ENSEMBL.
class(Homo.sapiens)
?OrganismDb
#entrez ids
length(unique(keys(Homo.sapiens, keytype="ENTREZID")))
#47721
#Q: Substitute keytype "ENSEMBL" in the keys command of the previous problem. How many unique Ensembl identifiers are found?:

length(unique(keys(Homo.sapiens, keytype="ENSEMBL")))

#We can use the select() function to look up specific values in the database.
#What is the Ensembl ID of the gene with the Entrez ID "9575"? => CLOCK
select(Homo.sapiens, key="9575", keytype="ENTREZID", columns=c("SYMBOL", "ENSEMBL", "ENTREZID", "CHR"))

# GO circadian
# Looking up genes by function 
tab = select(Homo.sapiens, key="circadian rhythm", keytype="TERM", columns=c("ENTREZID"))
head(tab)
length(unique(tab$ENTREZID) )
##packageVersion("Homo.sapiens")

#Q How many unique Entrez IDs are associated with circadian rhythm according to this GO term?
tab = select(Homo.sapiens, key="circadian rhythm", keytype="TERM", columns=c("ENTREZID"))
length(unique(tab$ENTREZID))

Expression Set

#library(biobase)
data(sample.ExpressionSet)
sample.ExpressionSet
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 500 features, 26 samples 
##   element names: exprs, se.exprs 
## protocolData: none
## phenoData
##   sampleNames: A B ... Z (26 total)
##   varLabels: sex type score
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
samp = sample.ExpressionSet
#phenotype
table(pData(samp)$sex)
## 
## Female   Male 
##     11     15
#expression data
#experimental data matrix is stored in exprs(samp)

#Question 1.6.1
#Make a new ExpressionSet which contains the subset of the samples which are female. Remember you can subset the ExpressionSet object directly with [ and ]. What is the sum of the first row of experiment data, accessed by exprs(), for only the female samples?
#subset females, what is expr sum
class(samp)
## [1] "ExpressionSet"
## attr(,"package")
## [1] "Biobase"
##exprs( samp[pData(samp)$sex=="Female"] )[1,]
##sum( exprs( samp[pData(samp)$sex=="Female"] )[1,] )

female = samp[,samp$sex == "Female"]
sum(exprs(female[1,]))
## [1] 1231.46
# Q: find experiment data
experimentData(samp)
## Experiment data
##   Experimenter name: Pierre Fermat 
##   Laboratory: Francis Galton Lab 
##   Contact information: pfermat@lab.not.exist 
##   Title: Smoking-Cancer Experiment 
##   URL: www.lab.not.exist 
##   PMIDs:  
## 
##   Abstract: A 8 word abstract is available. Use 'abstract' method.
##   notes:
##    notes:     
##       An example object of expression set (exprSet) class
annotation(samp)
## [1] "hgu95av2"
#What is the correlation - the Pearson correlation given by cor() - of the 'score' information for the samples and the experiment data for feature '31489_at'?
head(samp)
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1 features, 26 samples 
##   element names: exprs, se.exprs 
## protocolData: none
## phenoData
##   sampleNames: A B ... Z (26 total)
##   varLabels: sex type score
##   varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation: hgu95av2
samp$type
##  [1] Control Case    Control Case    Case    Control Case    Case   
##  [9] Case    Control Case    Control Case    Case    Case    Control
## [17] Case    Control Case    Case    Control Control Control Control
## [25] Case    Case   
## Levels: Case Control
edata=exprs(samp)["31489_at",]
cor(samp$score, edata)
## [1] 0.1376892

Genomic Ranges

#biocVersion()
#library(ERBS)
#data(HepG2)

#In the video we used the values method to extract meta-data on the regions. An alternative, and actually preferred approach going forward, is mcols.
#What is the median of the signalValue column?

median(mcols(HepG2)$signalValue)
## [1] 7.024
#In what chromosome is the region with the highest signalValue (copy and paste your answer)?
seqnames( HepG2[which.max(mcols(HepG2)$signalValue)  ] )
## factor-Rle of length 1 with 1 run
##   Lengths:    1
##   Values : chrX
## Levels(93): chr1 chr2 chr3 ... chrUn_gl000248 chrUn_gl000249
# How many regions are from chromosome 16?
length( HepG2[seqnames(HepG2)=="chr16"] )
## [1] 31
sum( seqnames(HepG2)=="chr16")
## [1] 31
?seqnames

#Make a histogram of the widths of the regions from all chromosomes (not just chr16). Note it has a heavy right tail.
#What is the median width?```
#widths <- function()
ranges(HepG2)
## IRanges of length 303
##           start       end width
## [1]    20335378  20335787   410
## [2]      328285    329145   861
## [3]   168135432 168136587  1156
## [4]     1244419   1245304   886
## [5]    64071828  64073069  1242
## ...         ...       ...   ...
## [299]   1797182   1797852   671
## [300] 110198573 110199126   554
## [301]  17734052  17734469   418
## [302]  48306453  48306908   456
## [303] 123867207 123867554   348
width(HepG2)
##   [1]  410  861 1156  886 1242  973  498  477  942  553 1042  667  621 1099
##  [15]  783  579  569 1358 1033  918  586  807  619  396  412  408  501 1048
##  [29]  612  503  397 1186  543  438  633  425  542 1825  403  401  493  422
##  [43]  459  644 1212  532  608  634  970  418  637 2138  665  489  576  425
##  [57] 1987  389  614  815  598  383  413 1146  469 1131  712  930  423  393
##  [71]  923  609  516  524  441  766  400  560  988  419  397  570 1048  479
##  [85]  931  445  432  542  527  441  367  709  568  417  473  354  843  365
##  [99]  736  416  441  549  588  459  551  465  516  402  503  838  529  366
## [113]  416  824  441  447  397  484  380  380  412  475  707  767  381  525
## [127]  805  380  634 1172  633  418  411 1098  547 1225  623  602  478  801
## [141]  480 1208  690  569  749  361  975  496  476  347  954  743  358  563
## [155]  379 1519  700  642  833  576  489  628  335  671  456  390  418  679
## [169]  650  451 1000  458  499  610  762  598 1034  834  544  479  465 1292
## [183]  483  427  491  490  348  459  566  488  366  430  746  446  401  768
## [197]  705 1056  921  439  778 1245  472  549  449  791 2048  688  492  769
## [211]  640  594  340  423  401  435  692  515 1065  397  644  445  839  459
## [225]  530  584  600  318  467  696  391  575  582  421  439 1360  922  418
## [239]  880  349  949  529  325  773  604 1342  385  651  714  336  773  553
## [253]  647  387 1159  809  544  572  499  637 1224  447  857  518  923  613
## [267]  711  384  619  432  637  848 3866 1247  331  315  324  706  408  360
## [281]  504  345  460  697  429  674  551  635 1025  678  805  383  751 1037
## [295]  997  571 1168  380  671  554  418  456  348
med<-median(width(HepG2))
mypar(1,1)
hist(width(HepG2), main=paste("region width over all chr, median=",med), nclass=25)

IRanges

ir <- IRanges(101,200)
start(ir)
## [1] 101
#zoom width
ir
## IRanges of length 1
##     start end width
## [1]   101 200   100
ir*2
## IRanges of length 1
##     start end width
## [1]   126 175    50
ir*3
## IRanges of length 1
##     start end width
## [1]   134 166    33
#Define an integer range starting at 101 and ending at 200. If we use the operation narrow(x, start=20), what is the new starting point of the range?
narrow(ir,start=20)
## IRanges of length 1
##     start end width
## [1]   120 200    81
?narrow
## Help on topic 'narrow' was found in the following packages:
## 
##   Package               Library
##   IRanges               /usr/local/lib/R/site-library
##   XVector               /usr/local/lib/R/site-library
##   GenomicRanges         /usr/local/lib/R/site-library
##   GenomicAlignments     /usr/local/lib/R/site-library
## 
## 
## Using the first match ...
#Define an integer range starting at 101 and ending at 200. If we use the operation +25, what is the width of the resulting range?
ir+25
## IRanges of length 1
##     start end width
## [1]    76 225   150
ir+50
## IRanges of length 1
##     start end width
## [1]    51 250   200
#Define an IRanges with starts at 1,11,21 and ends at 3,15,27. width() gives the widths for each range. What is the sum of the widths of all the ranges?
a<-IRanges(start=c(1,11,21),end=c(3,15,27))
sum(width(a))
## [1] 15
#Define an IRanges object, x, with the following set of ranges:
#Starts at 101,106,201,211,221,301,306,311,351,361,401,411,501
#Ends at 150,160,210,270,225,310,310,330,390,380,415,470,510
io <- IRanges(start=c(101,106,201,211,221,301,306,311,351,361,401,411,501),end=c(150,160,210,270,225,310,310,330,390,380,415,470,510) )
#Plot these ranges using the plotRanges function in the ph525x package.
plotRanges(io)
#What is the total width from 101 to 510 which is not covered by ranges in x?
start(io)+width(io)
##  [1] 151 161 211 271 226 311 311 331 391 381 416 471 511
sum(width(gaps(reduce(io))))
## [1] 130
#How many disjoint ranges are contained within the ranges in 'x' from the previous question? By disjoint ranges, we mean the following: for two ranges [1,10] and [6,15], there are three disjoint ranges contained within: [1,5], [6,10], and [11,15].
disjoin(io)
## IRanges of length 17
##      start end width
## [1]    101 105     5
## [2]    106 150    45
## [3]    151 160    10
## [4]    201 210    10
## [5]    211 220    10
## ...    ... ...   ...
## [13]   381 390    10
## [14]   401 410    10
## [15]   411 415     5
## [16]   416 470    55
## [17]   501 510    10
length(disjoin(io))
## [1] 17
?disjoin
## Help on topic 'disjoin' was found in the following packages:
## 
##   Package               Library
##   IRanges               /usr/local/lib/R/site-library
##   GenomicRanges         /usr/local/lib/R/site-library
## 
## 
## Using the first match ...
#plot resize
par(mfrow=c(2,1))
plotRanges(io)
plotRanges( resize(io,1))

#http://genomicsclass.github.io/book/pages/iranges_granges.html

GeneRanges cont

  1. the chromosome we are referring to (in Bioconductor, this is called “seqnames”)
  2. the strand of the DNA we are referring to (“+” or “-”). No strand is labelled with a star, “*“.
x = GRanges("chr1", IRanges(c(1,101),c(50,150)), strand=c("+","-"))
ranges(x)
## IRanges of length 2
##     start end width
## [1]     1  50    50
## [2]   101 150    50
plotGRanges = function(x) plotRanges(ranges(x))
plotGRanges(x)

plotGRanges(resize(x,1))

#Compare x and resize(x,1) using plotGRanges. The result of running resize(x,1) is two ranges of width 1 which start...

#isoforms
#Suppose we have two different sets of ranges, which overlap somewhat but not entirely. This is the case for many genes, in which there are different versions of transcripts, also called isoforms. The different transcripts consist of exons which end up in the final mRNA molecule, and a number of transcripts can share exons or have exons which are overlapping but not identical ranges.

#We'll start with a toy example, and learn how to load real genes later:

x = GRanges("chr1", IRanges(c(101,201,401,501),c(150,250,450,550)), strand="+")
y = GRanges("chr1", IRanges(c(101,221,301,401,541),c(150,250,350,470,550)), strand="+")
par(mfrow=c(2,1))
plotGRanges(x)
plotGRanges(y)

#Find the total width which is covered by ranges in both x and y. Hint: use c(), disjoin() and %over%.
#logical vector of overlap
head(x)
## GRanges object with 4 ranges and 0 metadata columns:
##       seqnames     ranges strand
##          <Rle>  <IRanges>  <Rle>
##   [1]     chr1 [101, 150]      +
##   [2]     chr1 [201, 250]      +
##   [3]     chr1 [401, 450]      +
##   [4]     chr1 [501, 550]      +
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
#?disjoin()
#create pieces
dis=disjoin(c(x,y))
#get index of overlap
overlap=dis %over% x & dis %over% y
#get width
sum(width(dis[overlap]))
## [1] 140
#get width non-overlap
nonoverlap = !(dis %over% x & dis %over% y)
sum(width(dis[nonoverlap]))
## [1] 130
# Define a new genomic range, 'z', which covers range(ranges(x)) but has the opposite strand.
# What is the number of ranges in x which overlap z according to the %over% command?
range( ranges(x))
## IRanges of length 1
##     start end width
## [1]   101 550   450
ranges(x)
## IRanges of length 4
##     start end width
## [1]   101 150    50
## [2]   201 250    50
## [3]   401 450    50
## [4]   501 550    50
z = GRanges("chr1", range(ranges(x)), strand="-")
ranges(z)
## IRanges of length 1
##     start end width
## [1]   101 550   450
sum(x %over% z)
## [1] 0

overlap genes

data(HepG2)
data(GM12878)

head(HepG2)
## GRanges object with 6 ranges and 7 metadata columns:
##       seqnames                 ranges strand |      name     score
##          <Rle>              <IRanges>  <Rle> | <numeric> <integer>
##   [1]     chr2 [ 20335378,  20335787]      * |      <NA>         0
##   [2]    chr20 [   328285,    329145]      * |      <NA>         0
##   [3]     chr6 [168135432, 168136587]      * |      <NA>         0
##   [4]    chr19 [  1244419,   1245304]      * |      <NA>         0
##   [5]    chr11 [ 64071828,  64073069]      * |      <NA>         0
##   [6]    chr20 [ 22410891,  22411863]      * |      <NA>         0
##             col signalValue    pValue       qValue      peak
##       <logical>   <numeric> <numeric>    <numeric> <integer>
##   [1]      <NA>      58.251    75.899 6.143712e-72       195
##   [2]      <NA>      10.808    69.685 5.028065e-66       321
##   [3]      <NA>      17.103    54.311 7.930665e-51       930
##   [4]      <NA>      12.427    43.855 1.359756e-40       604
##   [5]      <NA>       10.85    40.977 7.333863e-38       492
##   [6]      <NA>       6.419     41.02 7.749606e-38       660
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
class(HepG2)
## [1] "GRanges"
## attr(,"package")
## [1] "GenomicRanges"
#Where does the 17th HepG2 region start?
head(mcols(HepG2))
## DataFrame with 6 rows and 7 columns
##        name     score       col signalValue    pValue       qValue
##   <numeric> <integer> <logical>   <numeric> <numeric>    <numeric>
## 1        NA         0        NA      58.251    75.899 6.143712e-72
## 2        NA         0        NA      10.808    69.685 5.028065e-66
## 3        NA         0        NA      17.103    54.311 7.930665e-51
## 4        NA         0        NA      12.427    43.855 1.359756e-40
## 5        NA         0        NA      10.850    40.977 7.333863e-38
## 6        NA         0        NA       6.419    41.020 7.749606e-38
##        peak
##   <integer>
## 1       195
## 2       321
## 3       930
## 4       604
## 5       492
## 6       660
#HepG2[17,]
start(HepG2[17,])
## [1] 46528596
#Use distanceToNearest to find the closest region in GM12878 to the 17th region in HepG2. What is the start site of this region?
dtn<-distanceToNearest(HepG2[17],GM12878)
start( GM12878[subjectHits(dtn)] )
## [1] 46524762
#What is the distance between the 17th region of HepG2 and its closest region in GM12878?
mcols(dtn)$distance
## [1] 2284
#For each region in HepG2 find the closest region in GM12878 and record the distance. What proportion of these distances are smaller than 2000 base pairs? Distance is a metadata column on the Hits object, so consider mcols().
d<-distanceToNearest(HepG2,GM12878)
mean(mcols(d)$distance <2000)
## [1] 0.2673267

genes as GRanges

ghs = genes(Homo.sapiens)
#What genome build was used here?
#ghs

#How many genes are represented in ghs?
#length(unique(mcols(ghs)$GENEID))
length(ghs)
## [1] 23056
#What is the chromosome with the most genes?
tail(sort(table(seqnames(ghs))))
## 
##  chr3 chr17 chr11  chr2 chr19  chr1 
##  1271  1337  1439  1464  1607  2326
which.max(sort(table(seqnames(ghs))))
## chr1 
##   93
#Make a histogram of the widths of genes (use the width() on the GRanges object). This width gives the number of basepairs from the start of the gene to the end, so including exons and introns.
#mypar(2,1)
#histogram(width(ghs), breaks=100 )
#histogram(log10(width(ghs)))
#Which best describes the width of genes?
#fat-tail right

#What is the median gene width?
median(width(ghs))
## [1] 20115.5
res = findOverlaps(HepG2,GM12878)
erbs = HepG2[queryHits(res)]
erbs = granges(erbs)
#erbs2= intersect(HepG2,GM12878)
erbs2 = HepG2[GM12878%in%HepG2]
?intersect
## Help on topic 'intersect' was found in the following packages:
## 
##   Package               Library
##   BiocGenerics          /usr/local/lib/R/site-library
##   dplyr                 /usr/local/lib/R/site-library
##   base                  /usr/lib/R/library
## 
## 
## Using the first match ...
?sort
## Help on topic 'sort' was found in the following packages:
## 
##   Package               Library
##   BiocGenerics          /usr/local/lib/R/site-library
##   base                  /usr/lib/R/library
## 
## 
## Using the first match ...
head(erbs); head(erbs2)
## GRanges object with 6 ranges and 0 metadata columns:
##       seqnames               ranges strand
##          <Rle>            <IRanges>  <Rle>
##   [1]     chr2 [20335378, 20335787]      *
##   [2]    chr20 [  328285,   329145]      *
##   [3]    chr19 [ 1244419,  1245304]      *
##   [4]    chr11 [64071828, 64073069]      *
##   [5]     chr2 [16938364, 16938840]      *
##   [6]     chr3 [53290050, 53290602]      *
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## GRanges object with 0 ranges and 7 metadata columns:
##    seqnames    ranges strand |      name     score       col signalValue
##       <Rle> <IRanges>  <Rle> | <numeric> <integer> <logical>   <numeric>
##       pValue    qValue      peak
##    <numeric> <numeric> <integer>
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
length(erbs[erbs %over% erbs2])
## [1] 0
length(erbs)
## [1] 75
length(erbs2[erbs %over% erbs2])
## [1] 0
length(erbs2)
## [1] 0
## first order them
erbs3 = erbs[order(erbs),]
##confirm same chr
all( seqnames(erbs2)==seqnames(erbs3) )
## [1] TRUE
mean( start(erbs2)==start(erbs3) & end(erbs2)==end(erbs3) )
## [1] NaN
##the intersection should be smaller
all( width(erbs2) <= width(erbs3) )
## [1] TRUE
#Using the ghs regions:
library(Homo.sapiens)
ghs = genes(Homo.sapiens)
#and what you learned in the video, convert the ghs object to one that represents just the tss
#What is the TSS (Transcription Start Site) of the gene with ID: 100113402?
#Hint: look at the ghs in the console. Note that the names of the ranges are the same as the GENEID column. So you can index the ranges directly with "100113402"
tssgr= resize(ghs,1)
start(tssgr["100113402"])
## [1] 70563402
library(ERBS)
data(HepG2)
data(GM12878)
res = findOverlaps(HepG2,GM12878)
erbs = HepG2[queryHits(res)]
erbs = granges(erbs)

#What is the GENEID of the gene with TSS closest to the 4th region of erbs?
ghs = genes(Homo.sapiens)
tss<-resize(ghs,1)
d<-nearest( erbs[4], tss)
mcols(tss)$GENEID[d]
## FactorList of length 1
## [["2101"]] 2101
#what is symbol of gene?
#select()
head(mcols(tss))
## DataFrame with 6 rows and 1 column
##         GENEID
##   <FactorList>
## 1            1
## 2           10
## 3          100
## 4         1000
## 5        10000
## 6    100008586
gene=as.character(mcols(tss)$GENEID[d])
?as.character
#select(Homo.sapiens,key=gene,column="SYMBOL",keytype="GENEID")

gene region sequences

data(HepG2)
data(GM12878)
res = findOverlaps(HepG2,GM12878)
erbs = HepG2[queryHits(res)]
erbs = granges(erbs)
head(erbs)
## GRanges object with 6 ranges and 0 metadata columns:
##       seqnames               ranges strand
##          <Rle>            <IRanges>  <Rle>
##   [1]     chr2 [20335378, 20335787]      *
##   [2]    chr20 [  328285,   329145]      *
##   [3]    chr19 [ 1244419,  1245304]      *
##   [4]    chr11 [64071828, 64073069]      *
##   [5]     chr2 [16938364, 16938840]      *
##   [6]     chr3 [53290050, 53290602]      *
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
#Now use the getSeq function to extract the sequence of each region in erbs. Then compute the GC-content (the number of C's + the number of G's divided by the length of sequence) of each.
#Q:What is the median GC-content
seqs=getSeq(Hsapiens,erbs)
showMethods("getSeq")
## Function: getSeq (package Biostrings)
## x="BSgenome"
## x="FaFile"
## x="FaFileList"
## x="TwoBitFile"
showMethods("alphabetFrequency")
## Function: alphabetFrequency (package Biostrings)
## x="AAString"
## x="AAStringSet"
## x="BSgenomeViews"
## x="DNAString"
## x="DNAStringSet"
## x="MaskedXString"
## x="MultipleAlignment"
## x="RNAString"
## x="RNAStringSet"
## x="XString"
## x="XStringSet"
## x="XStringViews"
gc = alphabetFrequency(seqs)[,2:3]
n = width(erbs)
gccontent = rowSums(gc)/n
median(gccontent)
## [1] 0.652568
library(Biostrings)
#browseVignettes(package="Biostrings")
#help(package="BioStrings",help_type="html")

#Now create a control set of regions by shifting erbs by 10000.
#What is the median GC-content of these control regions:

control<-shift(erbs, 10000)
! all(erbs %over% control)
## [1] TRUE
seqs=getSeq(Hsapiens,control)
gc = alphabetFrequency(seqs)[,2:3]
n = width(erbs)
controlgccontent = rowSums(gc)/n
median(controlgccontent)
## [1] 0.4860174
boxplot(gccontent, controlgccontent)

Advanced Annotation

# Ref Genome Discovery
length(available.genomes())
## [1] 78
grep("Drerio", available.genomes(), value=TRUE)
## [1] "BSgenome.Drerio.UCSC.danRer5"       
## [2] "BSgenome.Drerio.UCSC.danRer5.masked"
## [3] "BSgenome.Drerio.UCSC.danRer6"       
## [4] "BSgenome.Drerio.UCSC.danRer6.masked"
## [5] "BSgenome.Drerio.UCSC.danRer7"       
## [6] "BSgenome.Drerio.UCSC.danRer7.masked"
#grep("Cerv*", available.genomes(), value=TRUE)

# Masking
#We have noted that the reference genome builds for complex organisms are works in progress. Genomic sequence "mask" structures have been defined to isolate ambiguous, unmappable, and low-complexity segments of genomes so that sequence analysis research can be targeted to reflect current knowledge of sequence regions that are more likely to be functionally informative.
c17m = BSgenome.Hsapiens.UCSC.hg19.masked$chr17
class(c17m)
## [1] "MaskedDNAString"
## attr(,"package")
## [1] "Biostrings"
library(Biostrings)
#mcols(c17m)
#mcols(BSgenome.Hsapiens.UCSC.hg19.masked$chr17)

# Quantifying Assembly Gaps
# In build hg19, what percentage of the length of chromosome 22 is occupied by "assembly gaps"? Reply with an integer between 0 and 100.
c22m = BSgenome.Hsapiens.UCSC.hg19.masked$chr22
round(100*sum(width(masks(c22m)$AGAPS))/length(c22m),0)
## [1] 32
help(BSgenome.Hsapiens.UCSC.hg19.masked)

liftover ref builds

#download.file("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz",
#  "hg19ToHg38.over.chain.gz")
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.0 (2015-02-19) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.19.0 (2015-02-27) successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## 
## The following objects are masked from 'package:devtools':
## 
##     check, unload
## 
## The following object is masked from 'package:GenomicRanges':
## 
##     trim
## 
## The following object is masked from 'package:IRanges':
## 
##     trim
## 
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## 
## The following objects are masked from 'package:base':
## 
##     attach, detach, gc, load, save
## 
## R.utils v2.1.0 (2015-05-27) successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## 
## The following object is masked from 'package:utils':
## 
##     timestamp
## 
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, parse, warnings
# gunzip("hg19ToHg38.over.chain.gz")

library(ERBS)
data(HepG2)
library(rtracklayer)
ch = import.chain("hg19ToHg38.over.chain") 
nHepG2 = liftOver(HepG2, ch)

# How far, in bases, has the first range in HepG2 (chr2, 20335378 is starting address) moved upstream in the hg38 build?:
start(HepG2[1])-start(nHepG2[1])
## IntegerList of length 1
## [["1"]] 199761

exons, introns, transcripts

# parsing gene model
# ?modPlot
# modPlot("ESR1", useGeneSym=FALSE, collapse=FALSE)

# enumerate transcript
length(transcriptsBy(Homo.sapiens, by="gene")$"2099")
## [1] 27
library(ReportingTools)
## Loading required package: knitr
## 
## 
## Attaching package: 'ReportingTools'
## 
## The following object is masked from 'package:rtracklayer':
## 
##     path
# BED files
library(rtracklayer)
data(targets)
class(targets)
## [1] "data.frame"
head(targets) #no version info, have to add columns
##                  name          target chrom     start       end strand
## 555774     hsa-miR-16 ENST00000000412 chr12   8985197   8985217      -
## 415091 hsa-miR-509-3p ENST00000003084  chr7 117095440 117095461      +
## 594550    hsa-miR-612 ENST00000003834 chr17  23750064  23750088      +
## 398678 hsa-miR-423-3p ENST00000006015  chr7  27187935  27187957      -
## 607152   hsa-miR-125b ENST00000006101 chr17  43458623  43458643      -
## 608640 hsa-miR-324-3p ENST00000006658 chr17  45988032  45988051      +
# Q 2.12.3 GRanges to bed
# We can create a GRanges instance from the targets data frame as follows
library(GenomicRanges)
mtar = with(targets,
    GRanges(chrom, IRanges(start,end), strand=strand,
  targets=target, mirname=name))

# You can glimpse of exported versions of this data with
cat(export(mtar[1:5], format="bed"), sep="\n")
## chr12    8985196 8985217 .   0   -
## chr7 117095439   117095461   .   0   +
## chr17    23750063    23750088    .   0   +
## chr7 27187934    27187957    .   0   -
## chr17    43458622    43458643    .   0   -
cat("\n")
cat(export(mtar[1:5], format="gff3"), sep="\n")
## ##gff-version 3
## ##source-version rtracklayer 1.28.10
## ##date 2015-10-20
## chr12    rtracklayer sequence_feature    8985197 8985217 .   -   .   targets=ENST00000000412;mirname=hsa-miR-16
## chr7 rtracklayer sequence_feature    117095440   117095461   .   +   .   targets=ENST00000003084;mirname=hsa-miR-509-3p
## chr17    rtracklayer sequence_feature    23750064    23750088    .   +   .   targets=ENST00000003834;mirname=hsa-miR-612
## chr7 rtracklayer sequence_feature    27187935    27187957    .   -   .   targets=ENST00000006015;mirname=hsa-miR-423-3p
## chr17    rtracklayer sequence_feature    43458623    43458643    .   -   .   targets=ENST00000006101;mirname=hsa-miR-125b
# How can metadata about the data origin and reference build be encoded in the bed export?
# descriptive file-names
mtar = with(targets,
    GRanges(chrom, IRanges(start,end), strand=strand,
  targets=target, mirname=name))

# You can glimpse of exported versions of this data with
cat(export(mtar[1:5], format="bed"), sep="\n")
## chr12    8985196 8985217 .   0   -
## chr7 117095439   117095461   .   0   +
## chr17    23750063    23750088    .   0   +
## chr7 27187934    27187957    .   0   -
## chr17    43458622    43458643    .   0   -
cat("\n")
cat(export(mtar[1:5], format="gff3"), sep="\n")
## ##gff-version 3
## ##source-version rtracklayer 1.28.10
## ##date 2015-10-20
## chr12    rtracklayer sequence_feature    8985197 8985217 .   -   .   targets=ENST00000000412;mirname=hsa-miR-16
## chr7 rtracklayer sequence_feature    117095440   117095461   .   +   .   targets=ENST00000003084;mirname=hsa-miR-509-3p
## chr17    rtracklayer sequence_feature    23750064    23750088    .   +   .   targets=ENST00000003834;mirname=hsa-miR-612
## chr7 rtracklayer sequence_feature    27187935    27187957    .   -   .   targets=ENST00000006015;mirname=hsa-miR-423-3p
## chr17    rtracklayer sequence_feature    43458623    43458643    .   -   .   targets=ENST00000006101;mirname=hsa-miR-125b

enumerate genes

# generate addresses of all genes for Homo sapiens, and then acquire the ER binding sites in liver cells.
library(Homo.sapiens)
g = genes(Homo.sapiens)
library(ERBS)
data(HepG2)

# Interpret the following computation:
kp = g[resize(g,1) %over% HepG2] 
# => tss
#resize(g,1) is a strand-respecting reduction of the gene interval to a single base representing the transcription start site


# The ReportingTools package provides very nice interactive tabulation. We'll generate a data.frame and publish it to the browser.
#library(IRanges)
#library(GenomicRanges)
#library(ReportingTools)
#nn = names(kp)
#m = select(Homo.sapiens, keys=nn, keytype="ENTREZID", columns=c("SYMBOL", "GENENAME", "TERM", "GO"))
#hrep = HTMLReport(shortName="erhep.html")
#publish(m, hrep)
#finish(hrep)

microarray technology

NGS

organize bioconductor architecture

import ma data

NGS

browser/d3.js

#source("http://bioconductor.org/biocLite.R")
#biocLite()
#source("https://bioconductor.org/biocLite.R")
#biocLite("epivizr")
#browseVignettes("genomes")