shell和perl对区间交集处理(坐标区间有重叠,则合并之并取其最小的起始坐标和最大的终止坐标)
-----------------------------------------------------------处理数据:
refseq1 813+ NM_152486 SAMD11
refseq1 611+ NM_152486 SAMD11
refseq1 1416+ NM_152486 SAMD11
refseq1 612- NM_021170 HES4
refseq1 810- NM_021170 HES4
refseq2 911- NM_001002919 FAM150B
refseq2 38- NM_001002919 FAM150B
--------------------------------------------------------
期望得到的结果是:
refseq1 613+ NM_152486_E1 SAMD11
refseq1 1416+ NM_152486_E2 SAMD11
refseq1 612- NM_021170_E1 HES4
refseq2 911- NM_001002919_E1 FAM150B
refseq2 38- NM_001002919_E2 FAM150B
------------------------------------------------------
1 ----------------------------------------------------------------------------------------
2 #!/usr/bin/perl
3 my %R;
4 while () {
5 my @s = split;
6 push @{ $R{ $s }{R} }, [ $s, $s ];
7 $R{ $s }{ $s } = [ @s[ 0, 3, 4, 5 ] ];
8 }
9
10 while ( my ( $k, $v ) = each %R ) {
11 my @tmp = sort { $a-> <=> $b-> } @{ $v->{R} };
12 my ( $E, @R ) = ( 1, shift @tmp );
13 for my $t (@tmp) {
14 $t-> > $R[-1] && push @R, $t and next;
15 $t-> > $R[-1] && ( $R[-1] = $t-> )
16 }
17 for my $R (@R) {
18 my ( $A, $B, $C, $D ) = @{ $v->{ $R-> } };
19 print join( "\t", $A, @$R, $B, "${C}_E" . $E++, $D ) . $/;
20 }
21 }
22 -------------------------------------------------------------------------------
1 -------------------------------------------------------------------------------
2 use strict;
3 use warnings;
4
5 my @name_array ;
6 my $hash;
7 while(){
8 #get serial from each line
9 my ($seq,$min,$max, $sign, $serial, $name) = split(/\s+/);
10
11 #in Python ,use infor convince
12 unless (grep $_-> eq $serial ,@name_array){
13 push@name_array, [$seq,$sign,$serial,$name];
14 } ;
15
16 ($min,$max) = ($max,$min) if $min > $max ;
17
18 if($hash->{$serial}){
19 sort_matrix($min,$max,$hash->{$serial});
20 }
21 else{
22 push @{$hash->{$serial}},[$min,$max];
23 }
24
25 }
26 ###print sort result
27 for my $item( @name_array ){
28 my $serial = $item->;
29 for (0..$#{$hash->{$serial}}){
30 print $item->," ",$hash->{$serial}->[$_]," ",$hash->{$serial}->[$_]," ",$item->," ",$serial,"_E",($_+1)," ",$item->,"\n";
31 }
32 }
33
34 sub sort_matrix{
35 my ($cur_min,$cur_max,$matrix) = @_;
36 # record index
37 my @splice_index;
38 for (0..$#$matrix){
39 my $item_min = $matrix->[$_]->;
40 my $item_max = $matrix->[$_]->;
41 next if $cur_max < $item_min or $cur_min > $item_max;
42 return if $cur_min >= $item_min and $cur_max <= $item_max;
43 $cur_min= $cur_min < $item_min ? $cur_min : $item_min;
44 $cur_max = $cur_max < $item_max ? $item_max : $cur_max;
45 push @splice_index, $_;
46 }
47 push @$matrix,[$cur_min,$cur_max];
48 #print "@splice_index\n";
49
50 for (0..$#splice_index){
51 splice(@$matrix,$splice_index[$_]-$_,1);##This is trick
52 }
53 #print $cur_min,":",$cur_max,"\n";
54 }
1 首先sort -k5 -k2 file ,使$2(坐标起点)升序排列,由于$2<$3(起点<终点)这个是恒成立的。所以按照我的这个思路第N+1行和第N行比较,然后合并。
2 # awk -v OFS="\t" 'NR==1{tag=$5;start=$2;end=$3;a=$6;b=$4;c=$1}NR>1{
3
4 if($5==tag){
5 if($2end)
6
7 end=$3
8
9 else if($3
10
11 else{
12
13 print ">"tag"_"++count,a,c,b,start,end;
14
15 start=$2;end=$3;tag=$5;a=$6;b=$4;c=$1
16
17 }
18
19
20 }
21
22
23 else {
24 print ">"tag"_"++count,a,c,b,start,end;
25
26 tag=$5;start=$2;end=$3;a=$6;b=$4;c=$1
27 count=0
28
29 }
30
31 }END{print ">"tag"_"++count,a,c,b,start,end}' <<<"refseq2 279363280352- NM_001002919 FAM150B
32 refseq2 282911283375- NM_001002919 FAM150B
33 refseq2 285923286403- NM_001002919 FAM150B
34 refseq2 286090286543- NM_001002919 FAM150B
35 refseq2 287383288092- NM_001002919 FAM150B
36 refseq2 287813288508- NM_001002919 FAM150B
37 refseq1 934142935012- NM_021170 HES4
38 refseq1 934706935193- NM_021170 HES4
39 refseq1 934872935367- NM_021170 HES4
40 refseq1 935046935752- NM_021170 HES4
41 refseq1 860921861380+ NM_152486 SAMD11
42 refseq1 861102861593+ NM_152486 SAMD11
43 refseq1 865335865916+ NM_152486 SAMD11
44 refseq1 866219866669+ NM_152486 SAMD11
45 refseq1 870952871476+ NM_152486 SAMD11
46 refseq1 874220874709+ NM_152486 SAMD11
47 refseq1 874455875040+ NM_152486 SAMD11
48 refseq1 879088880161+ NM_152486 SAMD11"
49 --------------------------------------------------------------------------
50 # cat file1
51 A 10 30
52 A 20 40
53 A 50 70
54 A1 38 168
55
56 # awk '{k=sprintf("%s_0d",$1,$2);for(n=0;n++<3;)a=$n;c}END{$1="";t=asorti(c,s);for(n=0;n++=a){a=$2;if($3>a)a=$3}else if($1)print $1,$2,$3;for(m=0;m++<3;)$m=a}}' file1
57 A1 38 168
58 A 10 40
59 A 50 70
页:
[1]