我想将 file1 列(1,2,4,5)映射到 file2 列(1,2,4,5)。第 5 列可能包含不同顺序的逗号分隔字符 (A,T,G,C)

文件 1

chr1 123896 rs0987522 A T
chr5 678452 rs8733521 G C,A

文件 2

chr1 123896 rs0987522 A T,C,G
chr5 678452 rs8733521 G A,T

输出

chr1 123896 rs0987522 A T*,C,G
chr5 678452 rs8733521 G C!,A*,T

如果文件 1 的第 1、2、4 列与文件 2 的第 1、2、4 列匹配,则应按上面给出的输出格式打印文件 2 中的对应行:文件 1 第 5 列中能在文件 2 第 5 列里找到的字符(此处为第 1 行第 5 列中的 T)应以“*”标记(T*),文件 1 中未能匹配的字符应以“!”标记(C!)。

awk '
# Pass 1 (File1, where NR==FNR): record each (col1, col2, col4) key.
NR==FNR { firstfile[$1,$2,$4]; next }
# Pass 2 (file2): print lines whose (col1, col2, col4) key was seen in File1.
# The lookup must use the same three fields as the stored key; a two-field
# subscript such as ($1,$4) never equals a three-field one.
($1,$2,$4) in firstfile
' File1 file2

我使用过 awk,但只实现了 (1,2,4) 列的匹配。请帮忙实现第 5 列的匹配。第 5 列中逗号分隔的字符在 file1 和 file2 中的顺序可能不同。

nnaxi 回答:我想将 file1 列(1,2,4,5)映射到 file2 列(1,2,4,5)。第 5 列可能包含不同顺序的逗号分隔字符 (A,T,G,C)

您可以使用此 awk,以 ($1,$2,$4) 为键存储 file1 的 $5。在处理 file2 时,它将存储的值按逗号拆分,并用 sub 在 file2 的 $5 中每个匹配到的值后面追加 *。如果未找到该元素(即 sub 返回 0),则把该值加上 ! 后插入到 $5 的最前面:

awk '
# Pass 1 (file1, where NR==FNR): remember column 5 keyed by columns 1, 2, 4.
# Column 2 (the position) is part of the key so that two records on the same
# chromosome with the same column 4 do not overwrite each other.
NR==FNR {
   map[$1,$2,$4] = $5
   next
}
# Pass 2 (file2): for lines whose key was stored, annotate column 5.
($1,$2,$4) in map {
    n = split(map[$1,$2,$4], a, /,/)
    for (i=1; i<=n; ++i)
       # sub() treats a[i] as a regex and appends "*" to its first match in $5;
       # this is exact only while the values are single letters (A,T,G,C).
       if (sub(a[i], "&*", $5) == 0)
          # Not present in file2: prepend the file1 value marked with "!".
          $5 = a[i] "!," $5
}
# Print every file2 line, modified or not.
1' file1 file2

chr1 123896 rs0987522 A T*,C,G
chr5 678452 rs8733521 G C!,A*,T
,

对于您给出的示例,请尝试以下 awk 程序。

awk '
# Pass 1 (Input_file2, the first file named, where FNR==NR): remember
# column 5 keyed by columns 1, 2 and 4.
FNR==NR{
  arr1[$1,$2,$4]=$5
  next
}
# Pass 2 (Input_file1): annotate only lines whose key was stored in pass 1.
# The membership test runs before any arr1[...] read, because merely
# referencing a missing element creates it and would make the test always true.
(($1,$2,$4) in arr1){
  val=""
  delete arr2;delete arr3;delete arr4;delete arr5
  # arr2: stored (Input_file2) values in order; arr4: the same values as a set.
  num1=split(arr1[$1,$2,$4],arr2,",")
  for(i=1;i<=num1;i++){ arr4[arr2[i]] }
  # arr3: values from column 5 of the current (Input_file1) line.
  num2=split($5,arr3,",")
  # Emit the current line values first: "*" if also stored, "!" otherwise.
  for(i=1;i<=num2;i++){
    val=(val!=""?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!")
    if(arr3[i] in arr4){ arr5[arr3[i]] }
  }
  # Then append the stored values that were not already emitted as common.
  for(i=1;i<=num1;i++){
    if(!(arr2[i] in arr5)){
      val=(val!=""?val ",":"") arr2[i]
    }
  }
  $5=val
}
# Print every Input_file1 line, edited or not.
1
' Input_file2  Input_file1

说明:为以上添加详细说明。

awk '                                               ##Starting awk program from here.
FNR==NR{                                            ##FNR==NR is true only while the first named file (Input_file2) is being read.
  arr1[$1,$2,$4]=$5                                 ##Creating arr1 with index of 1st,2nd and 4th field and value of 5th field.
  next                                              ##next will skip all further statements from here.
}
(($1,$2,$4) in arr1){                               ##Checking the key BEFORE reading arr1[...]; a plain read would create the element and make this test always true.
  val=""                                            ##Nullifying val here.
  delete arr2;delete arr3;delete arr4;delete arr5   ##Deleting arrays here.
  num1=split(arr1[$1,$2,$4],arr2,",")               ##Splitting stored value for index $1,$2,$4 into arr2 on commas.
  for(i=1;i<=num1;i++){ arr4[arr2[i]] }             ##Running loop till num1, creating arr4 indexed by each arr2 value (a lookup set).
  num2=split($5,arr3,",")                           ##Splitting 5th field of current line into arr3 on commas.
  for(i=1;i<=num2;i++){                             ##Running for loop till num2 here.
    val=(val!=""?val ",":"")(arr3[i] in arr4?arr3[i]"*":arr3[i]"!") ##Appending current line value with * if it is also in the stored values, else with !.
    if(arr3[i] in arr4){ arr5[arr3[i]] }            ##If arr3 value is present in arr4 then record it in arr5 as already printed.
  }
  for(i=1;i<=num1;i++){                             ##Running loop till value of num1 here.
    if(!(arr2[i] in arr5)){                         ##If stored value was NOT already printed as a common one then do following.
      val=(val!=""?val ",":"") arr2[i]              ##Append arr2 value to val, adding a comma only when val is not empty.
    }
  }
  $5=val                                            ##Assign val to 5th field here.
}
1                                                   ##Printing edited/non-edited line here.
' Input_file2  Input_file1                          ##Mentioning Input_file names here.
本文链接:https://www.f2er.com/5480.html

大家都在问