Concatenating genbank entries
From BioPerl
This script concatenates all contigs in a genbank file into one contig. Useful for browsing an entire, but fragmented, genome in a genome browser like Apollo. --Lskatz 21:18, 10 August 2012 (UTC)
#!/usr/bin/env perl # author: Lee Katz <lkatz@cdc.gov> use strict; use warnings; use Bio::Perl; use Getopt::Long; use Data::Dumper; use File::Basename; use List::Util qw/min max/; use LKUtils qw/logmsg seqHash/; my $settings={ linker=>"NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN", #linker=>"NNNNNNNNNN", }; exit(main()); sub main{ die usage() if(@ARGV<1); GetOptions($settings,qw(infile=s outfile=s nameOfOrganism=s linker=s)); my $file=$$settings{infile} || die "Missing infile\n".usage(); my $outfile=$$settings{outfile} or die "Missing outfile\n".usage(); $$settings{nameOfOrganism}||="StrainXYZ"; my $seq=concatSeqs($file,$settings); my $seqout=Bio::SeqIO->new(-file=>">$outfile"); $seqout->write_seq($seq); print "Output file is in $outfile\n"; return 1; } sub concatSeqs{ my($file,$settings)=@_; my %seq=seqHash($file); ## take the first sequence as the final one, and build on it #my ($finalSeqId,$finalSeq)=each(%seq); #delete($seq{$finalSeqId}); my $finalSeqLength=0; $finalSeqLength+=$_->length+length($$settings{linker}) for(values(%seq)); my $finalSeq=Bio::Seq->new(-id=>"$file",-seq=>"",-alphabet=>"dna"); $finalSeq->add_SeqFeature(Bio::SeqFeature::Generic->new( -start=>1, -end=>$finalSeqLength, -primary=>"source", -tag=>{ organism=>$$settings{nameOfOrganism}, }, )); my $lastStop=0; # last contig's stop coordinate while(my($id,$seq)=each(%seq)){ my @feat=$seq->get_SeqFeatures(); for my $f(@feat){ $f->remove_tag("transl_table") if($f->has_tag("transl_table")); # for apollo $f->start($f->start+$lastStop); $f->end($f->end+$lastStop); $finalSeq->add_SeqFeature($f); } $finalSeq->seq($finalSeq->seq.$seq->seq.$$settings{linker}); # add the linker - the "unsure" tag was the most appropriate one $finalSeq->add_SeqFeature(Bio::SeqFeature::Generic->new( -start=>$finalSeq->length-length($$settings{linker}), -end=>$finalSeq->length, -primary=>'intron', -tag=>{ locus_tag=>"ENDOFCONTIG_".$seq->id, note=>"lengthOfContig=".$seq->length, } )); $lastStop+=$seq->length; } return $finalSeq; } sub usage{ "Usage: perl $0 -i inputGenomicFile -s start -e end -i input file: the file extension dictates the format -o outfile -n name of organism (useful for genome browsers) -l linker (default: NNNNNNNNNN) "; }