RNA-seq R语言部分(表达矩阵合并及id转换)

1.读取各个样本的矩阵数据合并

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
options(stringsAsFactors = FALSE)#用read.table和其衍生品比如read.csv read.delim,R会自动把字符串string的列辨认读成factor
library("biomaRt")
library("tidyverse")
expr_SRR957678 <- read.table('SRR957678_count',col.names = c("gene_id","control2"))#col.names设置列名
expr_SRR957677 <- read.table('SRR957677_count',col.names = c("gene_id","control1"))
expr_SRR957679 <- read.table('SRR957679_count',col.names = c("gene_id","treat1"))
expr_SRR957680 <- read.table('SRR957680_count',col.names =c("gene_id","treat2"))
> head(expr_SRR957677)
gene_id control1
1 ENSG00000000003.10 796
2 ENSG00000000005.5 0
3 ENSG00000000419.8 389
4 ENSG00000000457.9 288
5 ENSG00000000460.12 506
6 ENSG00000000938.8 0
> head(expr_SRR957680)
gene_id treat2
1 ENSG00000000003.10 937
2 ENSG00000000005.5 1
3 ENSG00000000419.8 503
4 ENSG00000000457.9 283
5 ENSG00000000460.12 543
6 ENSG00000000938.8 0
#先对照和实验组分别合并到一个data.frame再合并为一个总的数据框
control <- merge(expr_SRR957677,expr_SRR957678,by="gene_id")#by="gene_id"按照gene_id这列合并
> head(control)
gene_id control1 control2
1 __alignment_not_unique 2454101 977633
2 __ambiguous 232830 94141
3 __no_feature 9207514 3905808
4 __not_aligned 1147541 533575
5 __too_low_aQual 0 0
6 ENSG00000000003.10 796 349
treat <- merge(expr_SRR957679,expr_SRR957680,by="gene_id")
> head(treat)
gene_id treat1 treat2
1 __alignment_not_unique 2583435 2848159
2 __ambiguous 255370 283449
3 __no_feature 8353992 10560422
4 __not_aligned 1204646 1287901
5 __too_low_aQual 0 0
6 ENSG00000000003.10 780 937
raw_count <- merge(control,treat,by="gene_id")
> head(raw_count)
gene_id control1 control2 treat1 treat2
1 __alignment_not_unique 2454101 977633 2583435 2848159
2 __ambiguous 232830 94141 255370 283449
3 __no_feature 9207514 3905808 8353992 10560422
4 __not_aligned 1147541 533575 1204646 1287901
5 __too_low_aQual 0 0 0 0
6 ENSG00000000003.10 796 349 780 937
raw_count <-raw_count[-1:-5,]#删掉前五行,由上一步结果可以看出前五行gene_id有问题
> head(raw_count)
gene_id control1 control2 treat1 treat2
6 ENSG00000000003.10 796 349 780 937
7 ENSG00000000005.5 0 0 0 1
8 ENSG00000000419.8 389 174 403 503
9 ENSG00000000457.9 288 108 218 283
10 ENSG00000000460.12 506 208 451 543
11 ENSG00000000938.8 0 0 0 0

2.去除gene_id名字中的小数点

EBI数据库中无法搜到ENSG00000000003.10这样的基因,只能是整数ENSG00000000003才能进行id转换

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
library(stringr)
ENSEMBL <- raw_count$gene_id
> head(ENSEMBL)
[1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
[4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
ENSEMBL <- str_split_fixed(ENSEMBL,'[.]',2)[,1]#将小数点及后面的数字去掉,ensembl_gene_id是整数
> head(ENSEMBL)
[1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419"
[4] "ENSG00000000457" "ENSG00000000460" "ENSG00000000938"
rownames(raw_count) <- ENSEMBL#将去除小数点后的ensembl_gene_id作为行名后期方便绘图
raw_count$ensembl_gene_id <- ENSEMBL#新建ensembl_gene_id列便于合并
> head(raw_count)
gene_id control1 control2 treat1 treat2
ENSG00000000003 ENSG00000000003.10 796 349 780 937
ENSG00000000005 ENSG00000000005.5 0 0 0 1
ENSG00000000419 ENSG00000000419.8 389 174 403 503
ENSG00000000457 ENSG00000000457.9 288 108 218 283
ENSG00000000460 ENSG00000000460.12 506 208 451 543
ENSG00000000938 ENSG00000000938.8 0 0 0 0
ensembl_gene_id
ENSG00000000003 ENSG00000000003
ENSG00000000005 ENSG00000000005
ENSG00000000419 ENSG00000000419
ENSG00000000457 ENSG00000000457
ENSG00000000460 ENSG00000000460
ENSG00000000938 ENSG00000000938

3.对基因进行注释,获取gene_symbol

bioMart包是一个连接bioMart数据库的R语言接口,能通过这个软件包自由连接到bioMart数据库
这个·包可以做以下几个工作
1.查找某个基因在染色体上的位置。反之,给定染色体每一区间,返回该区间的基因s;
2.通过EntrezGene的ID查找到相关序列的GO注释。反之,给定相关的GO注释,获取相关的EntrezGene的ID;
3.通过EntrezGene的ID查找到相关序列的上游100bp序列(可能包含启动子等调控元件);
4.查找人类染色体上每一段区域中已知的SNPs;
5.给定一组的序列ID,获得其中具体的序列;

1
2
3
4
5
6
7
8
9
#显示一下能连接的数据库
listMarts()
> listMarts()
biomart version
1 ENSEMBL_MART_ENSEMBL Ensembl Genes 101
2 ENSEMBL_MART_MOUSE Mouse strains 101
3 ENSEMBL_MART_SNP Ensembl Variation 101
4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 101

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#用useMart函数选定数据库
plant<-useMart("ensembl")
#用listDatasets()函数显示当前数据库所含的基因组注释,我们要获取的基因注释的基因是人类基因,所以选择hsapiens_gene_ensembl
> listDatasets(plant)
dataset
1 acalliptera_gene_ensembl
2 acarolinensis_gene_ensembl
3 acitrinellus_gene_ensembl
4 amelanoleuca_gene_ensembl
5 amexicanus_gene_ensembl
6 anancymaae_gene_ensembl
7 aocellaris_gene_ensembl
···
···
54 hsapiens_gene_ensembl

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#选定ensembl数据库中的hsapiens_gene_ensembl基因组
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
#选定我们需要获得的注释类型
#用lsitFilters()函数查看可选择的类型,选定要获取的注释类型,以及已知注释的类型用lsitFilters()函数查看可选择的类型,选定要获取的注释类型,以及已知注释的类型
> listFilters(mart)
name
1 chromosome_name
2 start
3 end
4 band_start
5 band_end
6 marker_start
7 marker_end
8 encode_region
9 strand
10 chromosomal_region
11 with_ccds
12 with_chembl
13 with_clone_based_ensembl_gene
14 with_clone_based_ensembl_transcript
15 with_dbass3
16 with_dbass5
17 with_entrezgene_trans_name
18 with_embl
19 with_arrayexpress
20 with_genedb
21 with_go
#这里我们选择这些要获得数值的类型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band我们已知的类型是ensembl_gene_id选择好数据库,基因组,要获得的注释类型,和已知的注释类型,就可以开始获取注释了
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#用getBM()函数获取注释
hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = ENSEMBL, mart = mart)
#attributers()里面的值为我们要获取的注释类型ensembl_gene_id ,hgnc_symbol chromosome_name start_position end_position band
#filters()里面的值为我们已知的注释类型ensembl_gene_id
#values= 这个值就是我们已知的注释类型的数据,把上面我们通过数据处理得到的ensembl基因序号作为ensembl_gene_id 的值
#mart= 这个值是我们所选定的数据库的基因组
> head(hg_symbols)
ensembl_gene_id hgnc_symbol chromosome_name start_position
1 ENSG00000000003 TSPAN6 X 100627108
2 ENSG00000000005 TNMD X 100584936
3 ENSG00000000419 DPM1 20 50934867
4 ENSG00000000457 SCYL3 1 169849631
5 ENSG00000000460 C1orf112 1 169662007
6 ENSG00000000938 FGR 1 27612064
end_position band
1 100639991 q22.1
2 100599885 q22.1
3 50958555 q13.13
4 169894267 q24.2
5 169854080 q24.2
6 27635185 p35.3
save(mart, file = "mart.RData")#将mart变量保存为mart.RData
save(hg_symbols, file = "hg_symbols.RData")#将hg_symbols变量保存为hg_symbols,biomart包不太稳定,有时连不上

4.将合并后的表达数据框raw_count与注释得到的hg_symbols整合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
read_count <- merge(raw_count,hg_symbols,by="ensembl_gene_id")#将raw_count与注释后得到的hg_symbols合并
> head(read_count)
ensembl_gene_id gene_id control1 control2 treat1 treat2
1 ENSG00000000003 ENSG00000000003.10 796 349 780 937
2 ENSG00000000005 ENSG00000000005.5 0 0 0 1
3 ENSG00000000419 ENSG00000000419.8 389 174 403 503
4 ENSG00000000457 ENSG00000000457.9 288 108 218 283
5 ENSG00000000460 ENSG00000000460.12 506 208 451 543
6 ENSG00000000938 ENSG00000000938.8 0 0 0 0
hgnc_symbol chromosome_name start_position end_position band
1 TSPAN6 X 100627108 100639991 q22.1
2 TNMD X 100584936 100599885 q22.1
3 DPM1 20 50934867 50958555 q13.13
4 SCYL3 1 169849631 169894267 q24.2
5 C1orf112 1 169662007 169854080 q24.2
6 FGR 1 27612064 27635185 p35.3
write.csv(read_count,file = "readcount_all.csv")
readcount <- raw_count[,c(-1,-6)]#删除最后一列ensembl_gene_id和以及第一列gene_id
> head(readcount)
control1 control2 treat1 treat2
ENSG00000000003 796 349 780 937
ENSG00000000005 0 0 0 1
ENSG00000000419 389 174 403 503
ENSG00000000457 288 108 218 283
ENSG00000000460 506 208 451 543
ENSG00000000938 0 0 0 0
write.csv(readcount,file = "readcount.csv")

引用

1.RNA-seq练习 第二部分(基因组序列下载,注释文件下载,索引下载,比对,比对质控,HTseq-count计数,输出count矩阵文件)

2.实例讲述bioMart包的用法

客官打个赏咯.