[Bioperl-guts-l] [14785] bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm: Warn user when some sequences are discarded because they are too short

Florent E Angly fangly at dev.open-bio.org
Mon Aug 4 21:17:28 EDT 2008


Revision: 14785
Author:   fangly
Date:     2008-08-04 21:17:28 -0400 (Mon, 04 Aug 2008)

Log Message:
-----------
Warn user when some sequences are discarded because they are too short

Modified Paths:
--------------
    bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm

Modified: bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm
===================================================================
--- bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm	2008-08-04 20:57:43 UTC (rev 14784)
+++ bioperl-run/trunk/Bio/Tools/Run/TigrAssembler.pm	2008-08-05 01:17:28 UTC (rev 14785)
@@ -253,10 +253,6 @@
       $self->throw("Not a valid Bio::PrimarySeqI");
     }
   }
-  # Remove sequences less than 40 bp long (not supported by TIGR Assembler)
-  $seqs = $self->_clean_seqs($seqs, 40);
-  return undef if scalar @$seqs <= 0;
-  
   # Assemble
   my @asms;
   my $tot_nof_seqs = scalar @$seqs;
@@ -266,41 +262,18 @@
     my $last = $i+$max_nof_seqs-1;
     $last = $tot_nof_seqs-1 if $last > $tot_nof_seqs-1;
     my @seq_subset = @$seqs[$first..$last];
-    # Write temp FASTA and QUAL input files
+    # Write temp FASTA and QUAL input files, removing sequences less than 40bp
     my ($fasta_file, $qual_file) = $self->_write_seq_file(\@seq_subset);
     # Assemble
-    my ($asm_obj, $asm_file) = $self->_run($fasta_file, $qual_file);
-    push @asms, $asm_obj
+    if (defined $fasta_file) {
+      my ($asm_obj, $asm_file) = $self->_run($fasta_file, $qual_file);
+      push @asms, $asm_obj
+    }
   }
   return \@asms;
 }
 
 
-=head2 _clean_seqs
-
- Title   :   _clean_seqs
- Usage   :   $assembler->_clean_seqs(\@seqs, $min_length);
- Function:   Remove sequences less than a given length
- Returns :   Bio::PrimarySeq object array reference
- Args    :   Bio::PrimarySeq object array reference
-
-=cut
-
-sub _clean_seqs {
-  my ($self, $seqs, $min_length) = @_;
-  my $size = scalar @$seqs;
-  for ( my $i = 0 ; $i < $size ; $i++ ) {
-    my $seq = $$seqs[$i];
-    if ($seq->length < $min_length) {
-      splice @$seqs, $i, 1;
-      $i--;
-      $size--;
-    }
-  }
-  return $seqs;
-}
-
-
 =head2 _write_seq_file
 
  Title   :   _write_seq_file
@@ -319,16 +292,27 @@
   my $fasta_out = Bio::SeqIO->new( -fh => $fasta_h , -format => 'fasta');
   my $qual_out = Bio::SeqIO->new( -fh => $qual_h , -format => 'qual');
   my $use_qual_file = 0;
-  for ( my $i = 0 ; $i < scalar @$seqs ; $i++ ) {
+  my $size = scalar @$seqs;
+  for ( my $i = 0 ; $i < $size ; $i++ ) {
     my $seq = $$seqs[$i];
-    # Make sure to give an ID if the sequence has none to prevent TIGR Assembler
-    # from crashing
+    # Make sure that all sequences have an ID (to prevent TIGR Assembler crash)
     if (not defined $seq->id) {
       my $newid = 'tmp'.$i;
       print $newid."\n";
       $seq->id($newid);
       $self->warn("A sequence had no ID. Its ID is now $newid");
     }
+    my $seqid = $seq->id;
+    # Remove sequences less than 40bp (not supported by TIGR_Assembler)
+    my $min_length = 40;
+    if ($seq->length < $min_length) {
+      splice @$seqs, $i, 1;
+      $i--;
+      $size--;
+      $self->warn("Sequence $seqid skipped: can not be assembled because its ".
+        "size is less than $min_length bp");
+      next;
+    }
     # Write the FASTA entries in files (and QUAL if appropriate)
     $fasta_out->write_seq($seq);
     if ($seq->isa('Bio::Seq::Quality') && scalar @{$seq->qual} > 0) {
@@ -340,6 +324,7 @@
   close($qual_h);
   $fasta_out->close();
   $qual_out->close();
+  return undef if scalar @$seqs <= 0;
   $qual_file = undef if $use_qual_file == 0;
   return $fasta_file, $qual_file;
 }




More information about the Bioperl-guts-l mailing list