1.读取各个样本的矩阵数据合并
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
| options(stringsAsFactors = FALSE) library("biomaRt") library("tidyverse") expr_SRR957678 <- read.table('SRR957678_count',col.names = c("gene_id","control2")) expr_SRR957677 <- read.table('SRR957677_count',col.names = c("gene_id","control1")) expr_SRR957679 <- read.table('SRR957679_count',col.names = c("gene_id","treat1")) expr_SRR957680 <- read.table('SRR957680_count',col.names =c("gene_id","treat2")) > head(expr_SRR957677) gene_id control1 1 ENSG00000000003.10 796 2 ENSG00000000005.5 0 3 ENSG00000000419.8 389 4 ENSG00000000457.9 288 5 ENSG00000000460.12 506 6 ENSG00000000938.8 0 > head(expr_SRR957680) gene_id treat2 1 ENSG00000000003.10 937 2 ENSG00000000005.5 1 3 ENSG00000000419.8 503 4 ENSG00000000457.9 283 5 ENSG00000000460.12 543 6 ENSG00000000938.8 0
control <- merge(expr_SRR957677,expr_SRR957678,by="gene_id") > head(control) gene_id control1 control2 1 __alignment_not_unique 2454101 977633 2 __ambiguous 232830 94141 3 __no_feature 9207514 3905808 4 __not_aligned 1147541 533575 5 __too_low_aQual 0 0 6 ENSG00000000003.10 796 349 treat <- merge(expr_SRR957679,expr_SRR957680,by="gene_id") > head(treat) gene_id treat1 treat2 1 __alignment_not_unique 2583435 2848159 2 __ambiguous 255370 283449 3 __no_feature 8353992 10560422 4 __not_aligned 1204646 1287901 5 __too_low_aQual 0 0 6 ENSG00000000003.10 780 937 raw_count <- merge(control,treat,by="gene_id") > head(raw_count) gene_id control1 control2 treat1 treat2 1 __alignment_not_unique 2454101 977633 2583435 2848159 2 __ambiguous 232830 94141 255370 283449 3 __no_feature 9207514 3905808 8353992 10560422 4 __not_aligned 1147541 533575 1204646 1287901 5 __too_low_aQual 0 0 0 0 6 ENSG00000000003.10 796 349 780 937 raw_count <-raw_count[-1:-5,] > head(raw_count) gene_id control1 control2 treat1 treat2 6 ENSG00000000003.10 796 349 780 937 7 ENSG00000000005.5 0 0 0 1 8 ENSG00000000419.8 389 174 403 503 9 ENSG00000000457.9 288 108 218 283 10 ENSG00000000460.12 506 208 451 543 11 ENSG00000000938.8 0 0 0 0
|
2.去除gene_id名字中的小数点
EBI数据库中无法搜到ENSG00000000003.10这样的基因,只能是整数ENSG00000000003才能进行id转换
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
| library(stringr) ENSEMBL <- raw_count$gene_id > head(ENSEMBL) [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8" [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8" ENSEMBL <- str_split_fixed(ENSEMBL,'[.]',2)[,1] > head(ENSEMBL) [1] "ENSG00000000003" "ENSG00000000005" "ENSG00000000419" [4] "ENSG00000000457" "ENSG00000000460" "ENSG00000000938" rownames(raw_count) <- ENSEMBL raw_count$ensembl_gene_id <- ENSEMBL > head(raw_count) gene_id control1 control2 treat1 treat2 ENSG00000000003 ENSG00000000003.10 796 349 780 937 ENSG00000000005 ENSG00000000005.5 0 0 0 1 ENSG00000000419 ENSG00000000419.8 389 174 403 503 ENSG00000000457 ENSG00000000457.9 288 108 218 283 ENSG00000000460 ENSG00000000460.12 506 208 451 543 ENSG00000000938 ENSG00000000938.8 0 0 0 0 ensembl_gene_id ENSG00000000003 ENSG00000000003 ENSG00000000005 ENSG00000000005 ENSG00000000419 ENSG00000000419 ENSG00000000457 ENSG00000000457 ENSG00000000460 ENSG00000000460 ENSG00000000938 ENSG00000000938
|
3.对基因进行注释,获取gene_symbol
bioMart包是一个连接bioMart数据库的R语言接口,能通过这个软件包自由连接到bioMart数据库
这个·包可以做以下几个工作
1.查找某个基因在染色体上的位置。反之,给定染色体每一区间,返回该区间的基因s;
2.通过EntrezGene的ID查找到相关序列的GO注释。反之,给定相关的GO注释,获取相关的EntrezGene的ID;
3.通过EntrezGene的ID查找到相关序列的上游100bp序列(可能包含启动子等调控元件);
4.查找人类染色体上每一段区域中已知的SNPs;
5.给定一组的序列ID,获得其中具体的序列;
1 2 3 4 5 6 7 8 9
| listMarts() > listMarts() biomart version 1 ENSEMBL_MART_ENSEMBL Ensembl Genes 101 2 ENSEMBL_MART_MOUSE Mouse strains 101 3 ENSEMBL_MART_SNP Ensembl Variation 101 4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 101
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
| plant<-useMart("ensembl")
> listDatasets(plant) dataset 1 acalliptera_gene_ensembl 2 acarolinensis_gene_ensembl 3 acitrinellus_gene_ensembl 4 amelanoleuca_gene_ensembl 5 amexicanus_gene_ensembl 6 anancymaae_gene_ensembl 7 aocellaris_gene_ensembl ··· ··· 54 hsapiens_gene_ensembl
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
| mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
> listFilters(mart) name 1 chromosome_name 2 start 3 end 4 band_start 5 band_end 6 marker_start 7 marker_end 8 encode_region 9 strand 10 chromosomal_region 11 with_ccds 12 with_chembl 13 with_clone_based_ensembl_gene 14 with_clone_based_ensembl_transcript 15 with_dbass3 16 with_dbass5 17 with_entrezgene_trans_name 18 with_embl 19 with_arrayexpress 20 with_genedb 21 with_go
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
| hg_symbols<- getBM(attributes=c('ensembl_gene_id','hgnc_symbol',"chromosome_name", "start_position","end_position", "band"), filters= 'ensembl_gene_id', values = ENSEMBL, mart = mart)
> head(hg_symbols) ensembl_gene_id hgnc_symbol chromosome_name start_position 1 ENSG00000000003 TSPAN6 X 100627108 2 ENSG00000000005 TNMD X 100584936 3 ENSG00000000419 DPM1 20 50934867 4 ENSG00000000457 SCYL3 1 169849631 5 ENSG00000000460 C1orf112 1 169662007 6 ENSG00000000938 FGR 1 27612064 end_position band 1 100639991 q22.1 2 100599885 q22.1 3 50958555 q13.13 4 169894267 q24.2 5 169854080 q24.2 6 27635185 p35.3 save(mart, file = "mart.RData") save(hg_symbols, file = "hg_symbols.RData")
|
4.将合并后的表达数据框raw_count与注释得到的hg_symbols整合
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
| read_count <- merge(raw_count,hg_symbols,by="ensembl_gene_id") > head(read_count) ensembl_gene_id gene_id control1 control2 treat1 treat2 1 ENSG00000000003 ENSG00000000003.10 796 349 780 937 2 ENSG00000000005 ENSG00000000005.5 0 0 0 1 3 ENSG00000000419 ENSG00000000419.8 389 174 403 503 4 ENSG00000000457 ENSG00000000457.9 288 108 218 283 5 ENSG00000000460 ENSG00000000460.12 506 208 451 543 6 ENSG00000000938 ENSG00000000938.8 0 0 0 0 hgnc_symbol chromosome_name start_position end_position band 1 TSPAN6 X 100627108 100639991 q22.1 2 TNMD X 100584936 100599885 q22.1 3 DPM1 20 50934867 50958555 q13.13 4 SCYL3 1 169849631 169894267 q24.2 5 C1orf112 1 169662007 169854080 q24.2 6 FGR 1 27612064 27635185 p35.3 write.csv(read_count,file = "readcount_all.csv") readcount <- raw_count[,c(-1,-6)] > head(readcount) control1 control2 treat1 treat2 ENSG00000000003 796 349 780 937 ENSG00000000005 0 0 0 1 ENSG00000000419 389 174 403 503 ENSG00000000457 288 108 218 283 ENSG00000000460 506 208 451 543 ENSG00000000938 0 0 0 0 write.csv(readcount,file = "readcount.csv")
|
引用
1.RNA-seq练习 第二部分(基因组序列下载,注释文件下载,索引下载,比对,比对质控,HTseq-count计数,输出count矩阵文件)
2.实例讲述bioMart包的用法