[Bioperl-guts-l] [14816] bioperl-live/trunk: * write PROJECT support
Christopher John Fields
cjfields at dev.open-bio.org
Thu Aug 21 12:00:12 EDT 2008
Revision: 14816
Author: cjfields
Date: 2008-08-21 12:00:12 -0400 (Thu, 21 Aug 2008)
Log Message:
-----------
* write PROJECT support
* test case
* round-trip tests
* WGS and similar tags should conform with other annotation tag names (all lowercase)
* bring handler-based parser up-to-date
Modified Paths:
--------------
bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
bioperl-live/trunk/Bio/SeqIO/genbank.pm
bioperl-live/trunk/t/Handler.t
bioperl-live/trunk/t/genbank.t
Added Paths:
-----------
bioperl-live/trunk/t/data/NC_008536.gb
Modified: bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm 2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/Bio/SeqIO/Handler/GenericRichSeqHandler.pm 2008-08-21 16:00:12 UTC (rev 14816)
@@ -671,6 +671,7 @@
my @genenames;
for my $section (split(m{\s*;\s*},$n)) {
my ($tag, $rest) = split("=",$section);
+ $rest ||= '';
for my $val (split(m{\s*,\s*},$rest)) {
push @genenames, [$tag => $val];
}
@@ -1114,7 +1115,7 @@
sub _generic_simplevalue {
my ($self, $data) = @_;
$self->annotation_collection->add_Annotation(
- Bio::Annotation::SimpleValue->new(-tagname => $data->{NAME},
+ Bio::Annotation::SimpleValue->new(-tagname => lc($data->{NAME}),
-value => $data->{DATA})
);
}
Modified: bioperl-live/trunk/Bio/SeqIO/genbank.pm
===================================================================
--- bioperl-live/trunk/Bio/SeqIO/genbank.pm 2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/Bio/SeqIO/genbank.pm 2008-08-21 16:00:12 UTC (rev 14816)
@@ -663,7 +663,7 @@
$_ =~ /^(?:CONTIG)?\s+(.*)/;
$annotation->add_Annotation(
Bio::Annotation::SimpleValue->new(-value => $1,
- -tagname => 'CONTIG'));
+ -tagname => 'contig'));
$_ = $self->_readline;
}
$self->_pushback($_);
@@ -672,7 +672,7 @@
chomp;
$annotation->add_Annotation(
Bio::Annotation::SimpleValue->new(-value => $_,
- -tagname => $1));
+ -tagname => lc($1)));
$_ = $self->_readline;
}
} elsif(! m{^(ORIGIN|//)} ) { # advance to the sequence, if any
@@ -821,6 +821,11 @@
"\n");
}
}
+
+ # if there, write the PROJECT line
+ for my $proj ( $seq->annotation->get_Annotations('project') ) {
+ $self->_print("PROJECT ".$proj->value."\n");
+ }
# if there, write the DBSOURCE line
foreach my $ref ( $seq->annotation->get_Annotations('dblink') ) {
@@ -961,20 +966,20 @@
}
# deal with WGS; WGS_SCAFLD present only if WGS is also present
- if($seq->annotation->get_Annotations('WGS')) {
+ if($seq->annotation->get_Annotations('wgs')) {
foreach my $wgs
- (map {$seq->annotation->get_Annotations($_)} qw(WGS WGS_SCAFLD)) {
- $self->_print(sprintf ("%-11s %s\n",$wgs->tagname,
+ (map {$seq->annotation->get_Annotations($_)} qw(wgs wgs_scaffold)) {
+ $self->_print(sprintf ("%-11s %s\n",uc($wgs->tagname),
$wgs->value));
}
$self->_show_dna(0);
}
- if($seq->annotation->get_Annotations('CONTIG')) {
+ if($seq->annotation->get_Annotations('contig')) {
my $ct = 0;
my $cline;
- foreach my $contig ($seq->annotation->get_Annotations('CONTIG')) {
+ foreach my $contig ($seq->annotation->get_Annotations('contig')) {
unless ($ct) {
- $cline = $contig->tagname." ".$contig->value."\n";
+ $cline = uc($contig->tagname)." ".$contig->value."\n";
} else {
$cline = " ".$contig->value."\n";
}
Modified: bioperl-live/trunk/t/Handler.t
===================================================================
--- bioperl-live/trunk/t/Handler.t 2008-08-21 13:37:50 UTC (rev 14815)
+++ bioperl-live/trunk/t/Handler.t 2008-08-21 16:00:12 UTC (rev 14816)
@@ -7,7 +7,7 @@
use lib 't/lib';
use BioperlTest;
- test_begin(-tests => 546);
+ test_begin(-tests => 550);
use_ok('Bio::SeqIO');
}
@@ -342,11 +342,11 @@
-file => test_input_file('O_sat.wgs'));
$seq = $gb->next_seq;
-my @tests = ('WGS' => 'AAAA02000001-AAAA02050231',
- 'WGS_SCAFLD' => 'CM000126-CM000137',
- 'WGS_SCAFLD' => 'CH398081-CH401163');
+my @tests = ('wgs' => 'AAAA02000001-AAAA02050231',
+ 'wgs_scafld' => 'CM000126-CM000137',
+ 'wgs_scafld' => 'CH398081-CH401163');
-my @wgs = map {$seq->annotation->get_Annotations($_)} qw(WGS WGS_SCAFLD);
+my @wgs = map {$seq->annotation->get_Annotations(lc($_))} (qw(WGS WGS_SCAFLD));
my $ct=0;
@@ -497,6 +497,48 @@
'Acholeplasmataceae, Acholeplasmatales, Mollicutes, '.
'Firmicutes, Bacteria', 'Bug 2195');
+# bug 2569, PROJECT line support, read and write, round-tripping
+
+$str = Bio::SeqIO->new(-format =>'gbdriver',
+ -verbose => $verbose,
+ -file => test_input_file('NC_008536.gb'));
+
+$seq = $str->next_seq;
+
+my $project = ($seq->annotation->get_Annotations('project'))[0];
+
+isa_ok($project, 'Bio::Annotation::SimpleValue');
+
+if ($project) {
+ is($project->value, 'GenomeProject:12638');
+} else {
+ ok(0, "PROJECT not parsed");
+}
+
+$outfile = test_output_file();
+
+$gb = Bio::SeqIO->new(-format => 'genbank',
+ -verbose => $verbose,
+ -file=> ">$outfile");
+
+$gb->write_seq($seq);
+
+$str = Bio::SeqIO->new(-format =>'gbdriver',
+ -verbose => $verbose,
+ -file => $outfile);
+
+$seq = $str->next_seq;
+
+$project = ($seq->annotation->get_Annotations('project'))[0];
+
+isa_ok($project, 'Bio::Annotation::SimpleValue');
+
+if ($project) {
+ is($project->value, 'GenomeProject:12638');
+} else {
+ ok(0, "Roundtrip test failed");
+}
+
################################## EMBL ##################################
# Set to -1 for release version, so warnings aren't printed
Added: bioperl-live/trunk/t/data/NC_008536.gb
===================================================================
--- bioperl-live/trunk/t/data/NC_008536.gb (rev 0)
+++ bioperl-live/trunk/t/data/NC_008536.gb 2008-08-21 16:00:12 UTC (rev 14816)
@@ -0,0 +1,195 @@
+LOCUS NC_008536 2001 bp DNA linear BCT 21-JUL-2008
+DEFINITION Solibacter usitatus Ellin6076, complete genome.
+ACCESSION NC_008536 REGION: 1000..3000
+VERSION NC_008536.1 GI:116619145
+PROJECT GenomeProject:12638
+KEYWORDS .
+SOURCE Solibacter usitatus Ellin6076
+ ORGANISM Solibacter usitatus Ellin6076
+ Bacteria; Acidobacteria; Solibacteres; Solibacterales;
+ Solibacteraceae; Solibacter.
+REFERENCE 1 (bases 1 to 2001)
+ AUTHORS Copeland,A., Lucas,S., Lapidus,A., Barry,K., Detter,J.C., Glavina
+ del Rio,T., Hammon,N., Israni,S., Dalin,E., Tice,H., Pitluck,S.,
+ Thompson,L.S., Brettin,T., Bruce,D., Han,C., Tapia,R., Gilna,P.,
+ Schmutz,J., Larimer,F., Land,M., Hauser,L., Kyrpides,N.,
+ Mikhailova,N., Janssen,P.H., Kuske,C.R. and Richardson,P.
+ CONSRTM US DOE Joint Genome Institute
+ TITLE Complete sequence of Solibacter usitatus Ellin6076
+ JOURNAL Unpublished
+REFERENCE 2 (bases 1 to 2001)
+ CONSRTM NCBI Genome Project
+ TITLE Direct Submission
+ JOURNAL Submitted (24-OCT-2006) National Center for Biotechnology
+ Information, NIH, Bethesda, MD 20894, USA
+REFERENCE 3 (bases 1 to 2001)
+ AUTHORS Copeland,A., Lucas,S., Lapidus,A., Barry,K., Detter,J.C., Glavina
+ del Rio,T., Hammon,N., Israni,S., Dalin,E., Tice,H., Pitluck,S.,
+ Thompson,L.S., Brettin,T., Bruce,D., Han,C., Tapia,R., Gilna,P.,
+ Schmutz,J., Larimer,F., Land,M., Hauser,L., Kyrpides,N.,
+ Mikhailova,N., Janssen,P.H., Kuske,C.R. and Richardson,P.
+ CONSRTM US DOE Joint Genome Institute
+ TITLE Direct Submission
+ JOURNAL Submitted (06-OCT-2006) US DOE Joint Genome Institute, 2800
+ Mitchell Drive B100, Walnut Creek, CA 94598-1698, USA
+COMMENT PROVISIONAL REFSEQ: This record has not yet been subject to final
+ NCBI review. The reference sequence was derived from CP000473.
+ URL -- http://www.jgi.doe.gov
+ JGI Project ID: 3634513
+ Source DNA and bacteria available from Cheryl R. Kuske
+ (kuske at lanl.gov)
+ Contacts: Cheryl R. Kuske (kuske at lanl.gov)
+ Paul Richardson (microbes at cuba.jgi-psf.org)
+ Quality assurance done by JGI-Stanford
+ Annotation done by JGI-ORNL and JGI-PGF
+ Finishing done by JGI-LANL
+ Finished microbial genomes have been curated to close all gaps with
+ greater than 98% coverage of at least two independent clones. Each
+ base pair has a minimum q (quality) value of 30 and the total error
+ rate is less than one per 50000.
+ The JGI and collaborators endorse the principles for the
+ distribution and use of large scale sequencing data adopted by the
+ larger genome sequencing community and urge users of this data to
+ follow them. It is our intention to publish the work of this
+ project in a timely fashion and we welcome collaborative
+ interaction on the project and analysis.
+ (http://www.genome.gov/page.cfm?pageID=10506376).
+ COMPLETENESS: full length.
+FEATURES Location/Qualifiers
+ source 1..2001
+ /organism="Solibacter usitatus Ellin6076"
+ /mol_type="genomic DNA"
+ /strain="Ellin6076"
+ /db_xref="taxon:234267"
+ gene <1..403
+ /locus_tag="Acid_0001"
+ /db_xref="GeneID:4431688"
+ CDS <1..403
+ /locus_tag="Acid_0001"
+ /note="KEGG: aba:Acid345_0001 chromosomal replication
+ initiator protein DnaA
+ TIGRFAM: chromosomal replication initiator protein DnaA
+ PFAM: Chromosomal replication initiator, DnaA C-terminal
+ domain; Chromosomal replication initiator, DnaA
+ SMART: AAA ATPase"
+ /codon_start=1
+ /transl_table=11
+ /product="chromosomal replication initiator protein DnaA"
+ /protein_id="YP_821302.1"
@@ Diff output truncated at 10000 characters. @@
More information about the Bioperl-guts-l
mailing list