|
这个程序,用perl语言实现了RNA序列翻译蛋白质序列的过程。
考虑了AG GU alternative splicing influence and start coden, stop coden
统计了RNA序列可生产蛋白质长度
引用请注明出处
#!/usr/bin/perl -w
# Program name: statProtein_splicing.pl
# Author : sunchen
# Contact : bbsunchen@gmail.com
# Date : 10/22/2011
# Last Update : 10/27/2011
# Reference : Please cite our following papers when you are using this script.
# Description :
#===============================================================================================================
use warnings;
use strict;
use Getopt::Long;
#my %opts;
#GetOptions(\%opts,"dir:s");
#my $usage= <<"USAGE";
#Program: $0
#INPUT:
#-dir full path of file
#OUTPUT:
#USAGE
#die $usage unless ($opts{dir} && -e $opts{dir});
#open DIR, $opts{dir};
#create a hash table contain the coden and Aa
my %Aa=
(
"UUU"=>"F",
"UUC"=>"F",
"UUA"=>"L",
"UUG"=>"L",
"CUU"=>"L",
"CUC"=>"L",
"CUA"=>"L",
"CUG"=>"L",
"AUU"=>"I",
"AUC"=>"I",
"AUA"=>"I",
"AUG"=>"START",
"GUU"=>"V",
"GUC"=>"V",
"GUA"=>"V",
"GUG"=>"V",
"UCU"=>"S",
"UCC"=>"S",
"UCA"=>"S",
"UCG"=>"S",
"CCU"=>"P",
"CCC"=>"P",
"CCA"=>"P",
"CCG"=>"P",
"ACU"=>"T",
"ACC"=>"T",
"ACA"=>"T",
"ACG"=>"T",
"GCU"=>"A",
"GCC"=>"A",
"GCA"=>"A",
"GCG"=>"A",
"UAU"=>"Y",
"UAC"=>"Y",
"UAA"=>"END",
"UAG"=>"END",
"CAU"=>"H",
"CAC"=>"H",
"CAA"=>"Q",
"CAG"=>"Q",
"AAU"=>"N",
"AAC"=>"N",
"AAA"=>"K",
"AAG"=>"K",
"GAU"=>"D",
"GAC"=>"D",
"GAA"=>"E",
"GAG"=>"E",
"UGU"=>"C",
"UGC"=>"C",
"UGA"=>"END",
"UGG"=>"W",
"CGU"=>"R",
"CGC"=>"R",
"CGA"=>"R",
"CGG"=>"R",
"AGU"=>"S",
"AGC"=>"S",
"AGA"=>"R",
"AGG"=>"R",
"GGU"=>"G",
"GGC"=>"G",
"GGA"=>"G",
"GGG"=>"G",
);
open DIR, "<E:/lab/4_AUG";
open OUT, ">E:/lab/out.txt";
my $protein = 0;
my $coden = 0;
my @array;
my $array_len = 0;
my $if_exist = 0;#$if_exist
my $seq = "";
my $start_point = 0;
my $end_point = 0;
my $order = 0;
my @coden_size;
sub detect_protein
{
if($if_exist == 1)
{
goto END_OF_SUB;
}
$coden = 0;
my($string) = @_;
@array=split "", $string;
$array_len = @array;
my $position = -1;
$position = index($string,"AUG");
my $if_start = 0;
my $to_print = "";
if($position == -1)
{
goto END_OF_SUB;
}else
{
$to_print = $to_print."M";
$if_start = 1;
$coden++;
}
my $pro = "";
for(my $i = $position+3; $i < $array_len-2; $i=$i+3)
{
$pro = $array[$i].$array[$i+1].$array[$i+2];
#print OUT $pro."\n";
#print OUT $Aa{$pro}."\n";
if($Aa{$pro} eq "START")
{
$coden++;
$to_print = $to_print."M";
}elsif($Aa{$pro} eq "END")
{
$if_start = 2;
$to_print = $to_print."\n";
$coden++;
last;
}else
{
$coden++;
$to_print = $to_print.$Aa{$pro};
}
}
if($if_start == 1)
{
$to_print = $to_print."\n";
}
if($if_start==2 && $if_exist == 0)
{
$protein++;
$if_exist = 1;
#print OUT $order."\n";
if($coden == 72)
{
print $to_print."\n";
}
#print OUT $to_print;
$coden_size[$coden]++;
#print OUT $string."\n";
}
END_OF_SUB:
}
while(<DIR>)
{
$order ++;
$seq = $_;
$if_exist = 0;
my $splicing_site_ag = 0;
$splicing_site_ag = index($seq,"AG");
my @ag_site=();
push(@ag_site,$splicing_site_ag);
while($splicing_site_ag != -1)
{
#print "$splicing_site_ag\n";
$splicing_site_ag = index($seq,"AG",$splicing_site_ag+2);
push(@ag_site,$splicing_site_ag);
}
my $splicing_site_gu = 0;
$splicing_site_gu = rindex($seq,"GU");
my @gu_site=();
push(@gu_site,$splicing_site_gu);
while($splicing_site_gu != -1)
{
#print "$splicing_site_gu\n";
$splicing_site_gu = rindex($seq,"GU",$splicing_site_gu-1);
if($splicing_site_gu != -1 && defined($splicing_site_gu))
{
push(@gu_site,$splicing_site_gu);
}else
{
last;
}
}
for(my $j = 0; $j < @ag_site;$j++)
{
if($ag_site[$j] > 270)
{
last;
}
if($if_exist == 1)
{
last;
}
for(my $k = @gu_site-1; $k >= 0;$k--)
{
#print JK "J:$ag_site[$j]##########K:$gu_site[$k]\n";
#print OUT "$ag_site[$j] < $gu_site[$k]\n";
if($ag_site[$j] < $gu_site[$k])
{
if(($gu_site[$k] - $ag_site[$j] - 2) >= 30)
{
my $substr = substr($seq, $ag_site[$j]+2, $gu_site[$k] - $ag_site[$j] - 2);
#print OUT2 "$substr\n";
my @sub_array = split "",$substr;
my $final_length = @sub_array;
detect_protein($substr);
last;
}
}
}
}
}
for(my $i = 0; $i < @coden_size; $i++)
{
print "$i\n";
}
print "###########################################\n";
for(my $i = 0; $i < @coden_size; $i++)
{
if(defined($coden_size[$i]))
{
print "$coden_size[$i]\n";
}else
{
print "0\n";
}
}
close DIR;
close OUT;
|
|