class DNA
attr_accessor :sequencing
def initialize(sequencing)
@sequencing = sequencing
end
def kmers(k)
@sequencing.each_char.each_cons(k).map(&:join)
end
def shared_kmers(k,dna)
kmers(k).each_with_object([]).with_index do |(kmer,result),index|
dna.kmers(k).each_with_index do |other_kmer,other_kmer_index|
result << [index,other_kmer_index] if kmer.eql?(other_kmer)
end
end
end
end
dna1 = DNA.new('GCCCAC')
dna2 = DNA.new('CCACGC')
dna1.kmers(2)
#=> ["GC","CC","CA","AC"]
dna2.kmers(2)
#=> ["CC","AC","CG","GC"]
dna1.shared_kmers(2,dna2)
#=> [[0,4],[1,0],[2,[3,1],[4,2]]
dna2.shared_kmers(2,dna1)
#=> [[0,[0,2],3],0]]
dna1.shared_kmers(3,dna2)
#=> [[2,1]]
dna1.shared_kmers(4,0]]
dna1.shared_kmers(5,dna2)
#=> []
,
我将只解决您的问题的症结,而不涉及类DNA
。重新整理后续内容应该很容易。
代码
def match_kmers(s1,s2,k)
h1 = dna_to_index(s1,k)
h2 = dna_to_index(s2,k)
h1.flat_map { |k,_| h1[k].product(h2[k] || []) }
end
def dna_to_index(dna,k)
dna.each_char.
with_index.
each_cons(k).
with_object({}) {|arr,h| (h[arr.map(&:first).join] ||= []) << arr.first.last}
end
示例
dna1 = 'GCCCAC'
dna2 = 'CCACGC'
match_kmers(dna1,dna2,2)
#=> [[0,2]]
match_kmers(dna2,dna1,0]]
match_kmers(dna1,3)
#=> [[2,1]]
match_kmers(dna2,3)
#=> [[0,3]]
match_kmers(dna1,4)
#=> [[2,0]]
match_kmers(dna2,4)
#=> [[0,2]]
match_kmers(dna1,5)
#=> []
match_kmers(dna2,5)
#=> []
match_kmers(dna1,6)
#=> []
match_kmers(dna2,6)
#=> []
说明
考虑dna1 = 'GCCCAC'
。这包含5个2聚体(k = 2
):
dna1.each_char.each_cons(2).to_a.map(&:join)
#=> ["GC","AC"]
类似地,对于dna2 = 'CCACGC'
:
dna2.each_char.each_cons(2).to_a.map(&:join)
#=> ["CC","GC"]
这些分别是dna_to_index
分别为dna1
和dna2
产生的哈希键。哈希值是相应键在DNA字符串中的起始位置的索引数组。让我们计算k = 2
的哈希值:
h1 = dna_to_index(dna1,2)
#=> {"GC"=>[0],"CC"=>[1,"CA"=>[3],"AC"=>[4]}
h2 = dna_to_index(dna2,2)
#=> {"CC"=>[0],"CA"=>[1],"AC"=>[2],"CG"=>[3],"GC"=>[4]}
h1
显示:
-
"GC"
从dna1
的索引0开始
-
"CC"
从dna1
的索引1和2开始
-
"CA"
从dna1
的索引3开始
-
"CC"
从dna1
的索引4开始
h2
具有类似的解释。参见Enumerable#flat_map和Array#product。
然后使用方法match_kmers
来构建索引对[i,j]
的期望数组,使得h1[i] = h2[j]
。
现在让我们来看一下为3-mers(k = 3
)生成的散列值:
h1 = dna_to_index(dna1,3)
#=> {"GCC"=>[0],"CCC"=>[1],"CCA"=>[2],"CAC"=>[3]}
h2 = dna_to_index(dna2,3)
#=> {"CCA"=>[0],"CAC"=>[1],"ACG"=>[2],"CGC"=>[3]}
我们看到dna1
中的第一个3聚体是"GCC"
,从索引0开始。这个3聚体没有出现在dna2
中,因此没有元素返回的数组中的[0,X]
(X
只是一个占位符)。 "CCC"
也不是第二个哈希中的键。 "CCA"
和"CAC"
存在于第二个哈希中,因此返回的数组为:
h1["CCA"].product(h2["CCA"]) + h1["CAC"].product(h2["CAC"])
#=> [[2,0]] + [[3,1]]
#=> [[2,1]]
,
我将首先编写一种方法来枚举给定长度(即k-mers)的子序列:
class DNA
def initialize(sequence)
@sequence = sequence
end
def each_kmer(length)
return enum_for(:each_kmer,length) unless block_given?
0.upto(@sequence.length - length) { |i| yield @sequence[i,length] }
end
end
DNA.new('GCCCAC').each_kmer(2).to_a
#=> ["GC","AC"]
除此之外,您还可以使用嵌套循环轻松收集相同k-mers的索引:
class DNA
# ...
def shared_kmers(length,other)
indices = []
each_kmer(length).with_index do |k,i|
other.each_kmer(length).with_index do |l,j|
indices << [i,j] if k == l
end
end
indices
end
end
dna1 = DNA.new('GCCCAC')
dna2 = DNA.new('CCACGC')
dna1.shared_kmers(2,2]]
不幸的是,以上代码对接收器中的每个k-mer遍历other.each_kmer
。我们可以通过在other
中预先建立包含每个k-mer的所有索引的哈希值来优化此操作:
class DNA
# ...
def shared_kmers(length,other)
hash = Hash.new { |h,k| h[k] = [] }
other.each_kmer(length).with_index { |k,i| hash[k] << i }
indices = []
each_kmer(length).with_index do |k,i|
hash[k].each { |j| indices << [i,j] }
end
indices
end
end