您想要实现的功能通常可以用 Bioconductor 上出色的 GenomicRanges 软件包来完成。您可以根据这些区域创建 GenomicRanges
对象,并使用 findOverlaps()
函数查找它们重叠的位置。要控制什么情况算作重叠,请参见 findOverlaps(..., type = "")
中的 "type" 参数。下面是一个 R 示例。我相信还有更优雅的写法,但这是我在午饭时间能想到的:
# install.packages('readr')
library(readr)
# install.packages('BiocManager')
# BiocManager::install('GenomicRanges')
library(GenomicRanges)
# Read data into R. Both files need the columns "chr", "start" and "end";
# genes.csv additionally needs a "Gene" column holding the gene names.
x <- read_csv('~/Downloads/regions.csv')
y <- read_csv('~/Downloads/genes.csv')

# Set up two GenomicRanges objects; all other columns are kept as metadata
gr_x <- makeGRangesFromDataFrame(x, keep.extra.columns = TRUE)
gr_y <- makeGRangesFromDataFrame(y, keep.extra.columns = TRUE)

# Fail early: without a "Gene" column `gr_y$Gene` is NULL and every region
# would silently end up with an empty gene string.
if (!"Gene" %in% names(mcols(gr_y))) {
  stop("genes.csv must contain a 'Gene' column", call. = FALSE)
}

# Overlap the regions containing genes with the trait data.
# Query = genes (gr_y), subject = trait regions (gr_x).
ovl <- findOverlaps(
  gr_y, gr_x,
  type = "any", select = "all", ignore.strand = TRUE
)

# Group hits by trait regions: a list of gene (query) indices, named by the
# index of the trait region (subject) they overlap.
hits_by_region <- split(from(ovl), to(ovl))

# Create a data.frame that matches a trait region index to a string of genes.
# Built column-wise so that zero overlaps yield a zero-row data.frame
# instead of NULL, and no one-row data.frames have to be rbind-ed together.
hits_df <- data.frame(
  Index = as.integer(names(hits_by_region)),
  Genes = vapply(
    hits_by_region,
    function(gene_idx) paste(gr_y$Gene[gene_idx], collapse = ","),
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)

# Add the string of genes as metadata to the GRanges object; regions without
# any overlapping gene keep the empty string.
gr_x$Genes <- ""
gr_x$Genes[hits_df$Index] <- hits_df$Genes

# Convert back to data.frame (if needed)
genes_by_regio_df <- as.data.frame(gr_x)
本文链接:https://www.f2er.com/3165845.html