[Bioperl-guts-l] [14816] bioperl-live/trunk: * write PROJECT support

Christopher John Fields cjfields at dev.open-bio.org
Thu Aug 21 12:00:12 EDT 2008


Revision: 14816
Author:   cjfields
Date:     2008-08-21 12:00:12 -0400 (Thu, 21 Aug 2008)

Log Message:
-----------
* write PROJECT support
* test case
* round-trip tests
* WGS and similar tags should conform with other annotation tag names (all lowercase)
* bring handler-based parser up-to-date

Modified Paths:
--------------
    bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
    bioperl-live/trunk/Bio/SeqIO/genbank.pm
    bioperl-live/trunk/t/Handler.t
    bioperl-live/trunk/t/genbank.t

Added Paths:
-----------
    bioperl-live/trunk/t/data/NC_008536.gb

Modified: bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm	2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm	2008-08-21 16:00:12 UTC (rev 14816)
@@ -671,6 +671,7 @@
                 my @genenames;
                 for my $section (split(m{\s*;\s*},$n)) {
                     my ($tag, $rest) = split("=",$section);
+                    $rest ||= '';
                     for my $val (split(m{\s*,\s*},$rest)) {
                         push @genenames, [$tag => $val];
                     }
@@ -1114,7 +1115,7 @@
 sub _generic_simplevalue {
     my ($self, $data) = @_;
     $self->annotation_collection->add_Annotation(
-        Bio::Annotation::SimpleValue->new(-tagname => $data->{NAME},
+        Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
        -value => $data->{DATA})
         );
 }

Modified: bioperl-live/trunk/Bio/SeqIO/genbank.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqIO/genbank.pm	2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/Bio/SeqIO/genbank.pm	2008-08-21 16:00:12 UTC (rev 14816)
@@ -663,7 +663,7 @@
 		    $_ =~ /^(?:CONTIG)?\s+(.*)/;
 		    $annotation->add_Annotation(
 						Bio::Annotation::SimpleValue->new(-value   => $1,
-										  -tagname => 'CONTIG'));
+										  -tagname => 'contig'));
 		    $_ = $self->_readline;
 		}
 		$self->_pushback($_);
@@ -672,7 +672,7 @@
 		    chomp;
 		    $annotation->add_Annotation(
 						Bio::Annotation::SimpleValue->new(-value => $_,
-										  -tagname => $1));
+										  -tagname => lc($1)));
 		    $_ = $self->_readline;
 		}
 	    } elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any
@@ -821,6 +821,11 @@
 			      "\n");
 	    }
 	}
+    
+    # if there, write the PROJECT line    
+	for my $proj ( $seq->annotation->get_Annotations('project') ) {
+		$self->_print("PROJECT     ".$proj->value."\n");
+    }
 
 	# if there, write the DBSOURCE line
 	foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
@@ -961,20 +966,20 @@
 	}
 
 	# deal with WGS; WGS_SCAFLD present only if WGS is also present
-	if($seq->annotation->get_Annotations('WGS')) {
+	if($seq->annotation->get_Annotations('wgs')) {
 	    foreach my $wgs
-		(map {$seq->annotation->get_Annotations($_)} qw(WGS WGS_SCAFLD)) {
-		    $self->_print(sprintf ("%-11s %s\n",$wgs->tagname,
+		(map {$seq->annotation->get_Annotations($_)} qw(wgs wgs_scaffold)) {
+		    $self->_print(sprintf ("%-11s %s\n",uc($wgs->tagname),
 					   $wgs->value));
 		}
 	    $self->_show_dna(0);
 	}
-	if($seq->annotation->get_Annotations('CONTIG')) {
+	if($seq->annotation->get_Annotations('contig')) {
 	    my $ct = 0;
 	    my $cline;
-	    foreach my $contig ($seq->annotation->get_Annotations('CONTIG')) {
+	    foreach my $contig ($seq->annotation->get_Annotations('contig')) {
 		unless ($ct) {
-		    $cline = $contig->tagname."      ".$contig->value."\n";
+		    $cline = uc($contig->tagname)."      ".$contig->value."\n";
 		} else {
 		    $cline = "            ".$contig->value."\n";
 		}

Modified: bioperl-live/trunk/t/Handler.t
===================================================================
--- bioperl-live/trunk/t/Handler.t	2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/t/Handler.t	2008-08-21 16:00:12 UTC (rev 14816)
@@ -7,7 +7,7 @@
     use lib 't/lib';
     use BioperlTest;
     
-    test_begin(-tests => 546);
+    test_begin(-tests => 550);
 	
     use_ok('Bio::SeqIO');
 }
@@ -342,11 +342,11 @@
                     -file   => test_input_file('O_sat.wgs'));
 $seq = $gb->next_seq;
 
-my @tests = ('WGS'        => 'AAAA02000001-AAAA02050231',
-            'WGS_SCAFLD' => 'CM000126-CM000137',
-            'WGS_SCAFLD' => 'CH398081-CH401163');
+my @tests = ('wgs'        => 'AAAA02000001-AAAA02050231',
+            'wgs_scafld' => 'CM000126-CM000137',
+            'wgs_scafld' => 'CH398081-CH401163');
 
-my @wgs = map {$seq->annotation->get_Annotations($_)} qw(WGS WGS_SCAFLD);
+my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD));
 
 my $ct=0;
 
@@ -497,6 +497,48 @@
    'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
    'Firmicutes, Bacteria', 'Bug 2195');
 
+# bug 2569, PROJECT line support, read and write, round-tripping
+    
+$str = Bio::SeqIO->new(-format =>'gbdriver',
+                      -verbose => $verbose,
+                      -file => test_input_file('NC_008536.gb'));
+
+$seq = $str->next_seq;
+
+my $project = ($seq->annotation->get_Annotations('project'))[0];
+
+isa_ok($project, 'Bio::Annotation::SimpleValue');
+
+if ($project) {
+	is($project->value, 'GenomeProject:12638');
+} else {
+	ok(0, "PROJECT not parsed");
+}
+
+$outfile = test_output_file();
+
+$gb = Bio::SeqIO->new(-format => 'genbank',
+                              -verbose => $verbose,
+                       -file=> ">$outfile");
+
+$gb->write_seq($seq);
+
+$str = Bio::SeqIO->new(-format =>'gbdriver',
+                      -verbose => $verbose,
+                      -file => $outfile);
+
+$seq = $str->next_seq;
+
+$project = ($seq->annotation->get_Annotations('project'))[0];
+
+isa_ok($project, 'Bio::Annotation::SimpleValue');
+
+if ($project) {
+	is($project->value, 'GenomeProject:12638');
+} else {
+	ok(0, "Roundtrip test failed");
+}
+
 ################################## EMBL ##################################
 
 # Set to -1 for release version, so warnings aren't printed

Added: bioperl-live/trunk/t/data/NC_008536.gb
===================================================================
--- bioperl-live/trunk/t/data/NC_008536.gb	                        (rev 0)
+++ bioperl-live/trunk/t/data/NC_008536.gb	2008-08-21 16:00:12 UTC (rev 14816)
@@ -0,0 +1,195 @@
+LOCUS       NC_008536               2001 bp    DNA     linear   BCT 21-JUL-2008
+DEFINITION  Solibacter usitatus Ellin6076, complete genome.
+ACCESSION   NC_008536 REGION: 1000..3000
+VERSION     NC_008536.1  GI:116619145
+PROJECT     GenomeProject:12638
+KEYWORDS    .
+SOURCE      Solibacter usitatus Ellin6076
+  ORGANISM  Solibacter usitatus Ellin6076
+            Bacteria; Acidobacteria; Solibacteres; Solibacterales;
+            Solibacteraceae; Solibacter.
+REFERENCE   1  (bases 1 to 2001)
+  AUTHORS   Copeland,A., Lucas,S., Lapidus,A., Barry,K., Detter,J.C., Glavina
+            del Rio,T., Hammon,N., Israni,S., Dalin,E., Tice,H., Pitluck,S.,
+            Thompson,L.S., Brettin,T., Bruce,D., Han,C., Tapia,R., Gilna,P.,
+            Schmutz,J., Larimer,F., Land,M., Hauser,L., Kyrpides,N.,
+            Mikhailova,N., Janssen,P.H., Kuske,C.R. and Richardson,P.
+  CONSRTM   US DOE Joint Genome Institute
+  TITLE     Complete sequence of Solibacter usitatus Ellin6076
+  JOURNAL   Unpublished
+REFERENCE   2  (bases 1 to 2001)
+  CONSRTM   NCBI Genome Project
+  TITLE     Direct Submission
+  JOURNAL   Submitted (24-OCT-2006) National Center for Biotechnology
+            Information, NIH, Bethesda, MD 20894, USA
+REFERENCE   3  (bases 1 to 2001)
+  AUTHORS   Copeland,A., Lucas,S., Lapidus,A., Barry,K., Detter,J.C., Glavina
+            del Rio,T., Hammon,N., Israni,S., Dalin,E., Tice,H., Pitluck,S.,
+            Thompson,L.S., Brettin,T., Bruce,D., Han,C., Tapia,R., Gilna,P.,
+            Schmutz,J., Larimer,F., Land,M., Hauser,L., Kyrpides,N.,
+            Mikhailova,N., Janssen,P.H., Kuske,C.R. and Richardson,P.
+  CONSRTM   US DOE Joint Genome Institute
+  TITLE     Direct Submission
+  JOURNAL   Submitted (06-OCT-2006) US DOE Joint Genome Institute, 2800
+            Mitchell Drive B100, Walnut Creek, CA 94598-1698, USA
+COMMENT     PROVISIONAL REFSEQ: This record has not yet been subject to final
+            NCBI review. The reference sequence was derived from CP000473.
+            URL -- http://www.jgi.doe.gov
+            JGI Project ID: 3634513
+            Source DNA and bacteria available from Cheryl R. Kuske
+            (kuske at lanl.gov)
+            Contacts: Cheryl R. Kuske (kuske at lanl.gov)
+                 Paul Richardson (microbes at cuba.jgi-psf.org)
+            Quality assurance done by JGI-Stanford
+            Annotation done by JGI-ORNL and JGI-PGF
+            Finishing done by JGI-LANL
+            Finished microbial genomes have been curated to close all gaps with
+            greater than 98% coverage of at least two independent clones. Each
+            base pair has a minimum q (quality) value of 30 and the total error
+            rate is less than one per 50000.
+            The JGI and collaborators endorse the principles for the
+            distribution and use of large scale sequencing data adopted by the
+            larger genome sequencing community and urge users of this data to
+            follow them. It is our intention to publish the work of this
+            project in a timely fashion and we welcome collaborative
+            interaction on the project and analysis.
+            (http://www.genome.gov/page.cfm?pageID=10506376).
+            COMPLETENESS: full length.
+FEATURES             Location/Qualifiers
+     source          1..2001
+                     /organism="Solibacter usitatus Ellin6076"
+                     /mol_type="genomic DNA"
+                     /strain="Ellin6076"
+                     /db_xref="taxon:234267"
+     gene            <1..403
+                     /locus_tag="Acid_0001"
+                     /db_xref="GeneID:4431688"
+     CDS             <1..403
+                     /locus_tag="Acid_0001"
+                     /note="KEGG: aba:Acid345_0001 chromosomal replication
+                     initiator protein DnaA
+                     TIGRFAM: chromosomal replication initiator protein DnaA
+                     PFAM: Chromosomal replication initiator, DnaA C-terminal
+                     domain; Chromosomal replication initiator, DnaA
+                     SMART: AAA ATPase"
+                     /codon_start=1
+                     /transl_table=11
+                     /product="chromosomal replication initiator protein DnaA"
+                     /protein_id="YP_821302.1"

@@ Diff output truncated at 10000 characters. @@



More information about the Bioperl-guts-l mailing list