In [1]:
library(limma)
library(edgeR)
Warning message:
“package ‘limma’ was built under R version 3.4.2”

Read the count table

In [2]:
cc <- read.csv("counts.csv")

Do the table formatting

In [3]:
rownames(cc) <- cc[,2]
cc <- cc[, -c(1,2)]
o <- order(colnames(cc))
cc <- cc[, o]
ccl <- apply(cc, c(1,2), function(x) log2(x+1))

Adjust the column names

In [4]:
tmp <- colnames(cc)
colnames(cc) <- sapply(tmp, function(x) substr(x, 14, 22))
colnames(ccl) <- sapply(tmp, function(x) substr(x, 14, 22))
In [5]:
head(cc)
mfc_cnt_1mfc_cnt_2mfc_cnt_3mfc_dac_1mfc_dac_2mfc_dac_3yb5_cnt_1yb5_cnt_2yb5_cnt_3yb5_dac_1yb5_dac_2yb5_dac_3
ENSG00000223972 11 18 55 46 57 56 59 35 32 278 203 215
ENSG00000227232208924417559364441724329405525682568680349575629
ENSG00000278267 0 0 0 0 0 0 0 0 0 0 0 0
ENSG00000243485 2 3 8 8 11 13 12 7 9 33 26 23
ENSG00000237613 0 0 0 0 0 0 1 0 0 2 1 0
ENSG00000268020 0 0 0 0 0 0 0 0 1 0 0 0
In [6]:
head(ccl)
mfc_cnt_1mfc_cnt_2mfc_cnt_3mfc_dac_1mfc_dac_2mfc_dac_3yb5_cnt_1yb5_cnt_2yb5_cnt_3yb5_dac_1yb5_dac_2yb5_dac_3
ENSG00000223972 3.584963 4.247928 5.807355 5.554589 5.857981 5.832890 5.906891 5.169925 5.044394 8.124121 7.672425 7.754888
ENSG0000022723211.02928711.25384712.88417111.83170312.02686912.08015111.98584211.32699111.32699112.73216712.27554312.458919
ENSG00000278267 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
ENSG00000243485 1.584963 2.000000 3.169925 3.169925 3.584963 3.807355 3.700440 3.000000 3.321928 5.087463 4.754888 4.584963
ENSG00000237613 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.584963 1.000000 0.000000
ENSG00000268020 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000
In [7]:
boxplot(ccl)

Read the design/metadata

In [8]:
meta <- read.table("metadata.txt", sep="\t", header=T)
rownames(meta) <- as.character(meta[,1])
o <- order(rownames(meta))
meta <- meta[o, ]
meta
FileNamecellinetreatment
mfc_cnt_1mfc_cnt_1mfc cnt
mfc_cnt_2mfc_cnt_2mfc cnt
mfc_cnt_3mfc_cnt_3mfc cnt
mfc_dac_1mfc_dac_1mfc dac
mfc_dac_2mfc_dac_2mfc dac
mfc_dac_3mfc_dac_3mfc dac
yb5_cnt_1yb5_cnt_1yb5 cnt
yb5_cnt_2yb5_cnt_2yb5 cnt
yb5_cnt_3yb5_cnt_3yb5 cnt
yb5_dac_1yb5_dac_1yb5 cnt
yb5_dac_2yb5_dac_2yb5 cnt
yb5_dac_3yb5_dac_3yb5 cnt
In [9]:
targets <- meta[, c("celline", "treatment")]
targets <- as.data.frame(targets)
In [10]:
targets
cellinetreatment
mfc_cnt_1mfccnt
mfc_cnt_2mfccnt
mfc_cnt_3mfccnt
mfc_dac_1mfcdac
mfc_dac_2mfcdac
mfc_dac_3mfcdac
yb5_cnt_1yb5cnt
yb5_cnt_2yb5cnt
yb5_cnt_3yb5cnt
yb5_dac_1yb5cnt
yb5_dac_2yb5cnt
yb5_dac_3yb5cnt

Create design object

In [11]:
Treat <- factor(paste(targets$celline,targets$treatment,sep="."))
design <- model.matrix(~0+Treat)
colnames(design) <- levels(Treat)
In [12]:
design
mfc.cntmfc.dacyb5.cnt
1100
2100
3100
4010
5010
6010
7001
8001
9001
10001
11001
12001

Do the dispesion estimate and model fitting

In [13]:
y <- DGEList(counts=cc)
y <- estimateDisp(y, design)
fit <- glmFit(y, design)

Set up the biological question with a contrast

In [14]:
cm <- makeContrasts( mfc.cnt-mfc.dac, levels=design)
cm
mfc.cnt - mfc.dac
mfc.cnt 1
mfc.dac-1
yb5.cnt 0

Do the model fit and get top genes

In [15]:
lrt <- glmLRT(fit, contrast=cm)
topOnes <- topTags(lrt, n=dim(cc)[1])$table
In [16]:
head(topOnes)
logFClogCPMLRPValueFDR
ENSG00000233276-5.703099 5.916299 243.9207 5.493388e-553.185726e-50
ENSG00000141756-2.501588 6.007440 193.9030 4.471368e-441.296518e-39
ENSG00000226383-5.534574 3.531377 173.8707 1.056378e-392.042050e-35
ENSG00000164850-2.078655 5.068626 173.1630 1.507899e-392.186152e-35
ENSG00000084207-4.253757 8.232289 170.8907 4.727468e-395.483106e-35
ENSG00000163362-2.521056 5.212185 151.6937 7.392005e-357.144619e-31

Simple pairwise comparison

In [17]:
cc2 <- cc[,1:6]
group <- factor(c(1,1,1,2,2,2))
y <- DGEList(counts=cc2,group=group)
y <- calcNormFactors(y)
design <- model.matrix(~group)
y <- estimateDisp(y,design)

fit <- glmFit(y,design)
lrt <- glmLRT(fit)
topOnes_pairwise <- topTags(lrt, n=dim(cc2)[1])$table
In [18]:
head(topOnes_pairwise)
logFClogCPMLRPValueFDR
ENSG00000130600 5.505566 9.008124 1041.4478 1.758697e-2281.019903e-223
ENSG00000135480 4.606337 5.098251 792.1216 2.786028e-1748.078366e-170
ENSG00000070985 6.610261 2.469754 625.4816 4.803161e-1389.284831e-134
ENSG00000147174 3.857756 3.692567 606.1356 7.749302e-1341.123494e-129
ENSG00000105974-2.766143 5.721369 605.2399 1.213603e-1331.407585e-129
ENSG00000198883 7.718994 2.154230 589.9396 2.582475e-1302.496048e-126
In [19]:
length(intersect(rownames(topOnes)[1:500], rownames(topOnes_pairwise)[1:500]))
207

DESeq2 way

In [26]:
library(DESeq2)
In [21]:
cc2 <- cc[,1:6]
conds <- c("cnt","cnt","cnt",  "dac", "dac", "dac")
colData<-data.frame(condition=as.factor(conds))
dds <- DESeqDataSetFromMatrix(countData = cc2,colData = colData,design = formula(colData) ) 
dds <- DESeq(dds)
res <- results(dds)
o <- order(res[,"pvalue"])
res <- res[o, ]
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing

DESeq2 and edgeR agree quite a lot

In [22]:
length(intersect(rownames(res)[1:500], rownames(topOnes_pairwise)[1:500]))
385
In [25]:
library(Vennerable)
In [24]:
i1 <- rownames(res)[1:500]
i2 <- rownames(topOnes_pairwise)[1:500]
i3 <- rownames(topOnes)[1:500]

vennD <- Venn ( list(i1, i2, i3), SetNames = c("DESeq2", "edgeR_pairwise","edgeR_glm"))
plot(vennD, doWeights = T, type = "circles")
In [ ]: