[Bioperl-guts-l] [14785] bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm: Warn user when some sequences are discarded because they are too short
Florent E Angly
fangly at dev.open-bio.org
Mon Aug 4 21:17:28 EDT 2008
Revision: 14785
Author: fangly
Date: 2008-08-04 21:17:28 -0400 (Mon, 04 Aug 2008)
Log Message:
-----------
Warn user when some sequences are discarded because they are too short
Modified Paths:
--------------
bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm
Modified: bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm
===================================================================
--- bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm 2008-08-04 20:57:43 UTC (rev 14784)
+++ bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm 2008-08-05 01:17:28 UTC (rev 14785)
@@ -253,10 +253,6 @@
$self->throw("Not a valid Bio::PrimarySeqI");
}
}
- # Remove sequences less than 40 bp long (not supported by TIGR Assembler)
- $seqs = $self->_clean_seqs($seqs, 40);
- return undef if scalar @$seqs <= 0;
-
# Assemble
my @asms;
my $tot_nof_seqs = scalar @$seqs;
@@ -266,41 +262,18 @@
my $last = $i+$max_nof_seqs-1;
$last = $tot_nof_seqs-1 if $last > $tot_nof_seqs-1;
my @seq_subset = @$seqs[$first..$last];
- # Write temp FASTA and QUAL input files
+ # Write temp FASTA and QUAL input files, removing sequences less than 40bp
my ($fasta_file, $qual_file) = $self->_write_seq_file(\@seq_subset);
# Assemble
- my ($asm_obj, $asm_file) = $self->_run($fasta_file, $qual_file);
- push @asms, $asm_obj
+ if (defined $fasta_file) {
+ my ($asm_obj, $asm_file) = $self->_run($fasta_file, $qual_file);
+ push @asms, $asm_obj
+ }
}
return \@asms;
}
-=head2 _clean_seqs
-
- Title : _clean_seqs
- Usage : $assembler->_clean_seqs(\@seqs, $min_length);
- Function: Remove sequences less than a given length
- Returns : Bio::PrimarySeq object array reference
- Args : Bio::PrimarySeq object array reference
-
-=cut
-
-sub _clean_seqs {
- my ($self, $seqs, $min_length) = @_;
- my $size = scalar @$seqs;
- for ( my $i = 0 ; $i < $size ; $i++ ) {
- my $seq = $$seqs[$i];
- if ($seq->length < $min_length) {
- splice @$seqs, $i, 1;
- $i--;
- $size--;
- }
- }
- return $seqs;
-}
-
-
=head2 _write_seq_file
Title : _write_seq_file
@@ -319,16 +292,27 @@
my $fasta_out = Bio::SeqIO->new( -fh => $fasta_h , -format => 'fasta');
my $qual_out = Bio::SeqIO->new( -fh => $qual_h , -format => 'qual');
my $use_qual_file = 0;
- for ( my $i = 0 ; $i < scalar @$seqs ; $i++ ) {
+ my $size = scalar @$seqs;
+ for ( my $i = 0 ; $i < $size ; $i++ ) {
my $seq = $$seqs[$i];
- # Make sure to give an ID if the sequence has none to prevent TIGR Assembler
- # from crashing
+ # Make sure that all sequences have an ID (to prevent TIGR Assembler crash)
if (not defined $seq->id) {
my $newid = 'tmp'.$i;
print $newid."\n";
$seq->id($newid);
$self->warn("A sequence had no ID. Its ID is now $newid");
}
+ my $seqid = $seq->id;
+ # Remove sequences less than 40bp (not supported by TIGR_Assembler)
+ my $min_length = 40;
+ if ($seq->length < $min_length) {
+ splice @$seqs, $i, 1;
+ $i--;
+ $size--;
+ $self->warn("Sequence $seqid skipped: can not be assembled because its ".
+ "size is less than $min_length bp");
+ next;
+ }
# Write the FASTA entries in files (and QUAL if appropriate)
$fasta_out->write_seq($seq);
if ($seq->isa('Bio::Seq::Quality') && scalar @{$seq->qual} > 0) {
@@ -340,6 +324,7 @@
close($qual_h);
$fasta_out->close();
$qual_out->close();
+ return undef if scalar @$seqs <= 0;
$qual_file = undef if $use_qual_file == 0;
return $fasta_file, $qual_file;
}
More information about the Bioperl-guts-l
mailing list