Concatenating genbank entries

From BioPerl
Jump to: navigation, search

This script concatenates all contigs in a genbank file into one contig. Useful for browsing an entire, but fragmented, genome in a genome browser like Apollo. --Lskatz 21:18, 10 August 2012 (UTC)

#!/usr/bin/env perl
# author: Lee Katz <lkatz@cdc.gov>
 
use strict;
use warnings;
use Bio::Perl;
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use List::Util qw/min max/;
use LKUtils qw/logmsg seqHash/;
 
my $settings={
  linker=>"NNNNNCACACACTTAATTAATTAAGTGTGTGNNNNN",
  #linker=>"NNNNNNNNNN",
};
 
exit(main());
 
sub main{
  die usage() if(@ARGV<1);
  GetOptions($settings,qw(infile=s outfile=s nameOfOrganism=s linker=s));
 
  my $file=$$settings{infile} || die "Missing infile\n".usage();
  my $outfile=$$settings{outfile} or die "Missing outfile\n".usage();
  $$settings{nameOfOrganism}||="StrainXYZ";
 
  my $seq=concatSeqs($file,$settings);
 
  my $seqout=Bio::SeqIO->new(-file=>">$outfile");
  $seqout->write_seq($seq);
 
  print "Output file is in $outfile\n";
 
  return 1;
}
 
sub concatSeqs{
  my($file,$settings)=@_;
  my %seq=seqHash($file);
  ## take the first sequence as the final one, and build on it
  #my ($finalSeqId,$finalSeq)=each(%seq);
  #delete($seq{$finalSeqId});
 
  my $finalSeqLength=0;
  $finalSeqLength+=$_->length+length($$settings{linker}) for(values(%seq));
 
  my $finalSeq=Bio::Seq->new(-id=>"$file",-seq=>"",-alphabet=>"dna");
  $finalSeq->add_SeqFeature(Bio::SeqFeature::Generic->new(
    -start=>1,
    -end=>$finalSeqLength,
    -primary=>"source",
    -tag=>{
      organism=>$$settings{nameOfOrganism},
    },
  ));
 
  my $lastStop=0; # last contig's stop coordinate
  while(my($id,$seq)=each(%seq)){
    my @feat=$seq->get_SeqFeatures();
    for my $f(@feat){
      $f->remove_tag("transl_table") if($f->has_tag("transl_table")); # for apollo
      $f->start($f->start+$lastStop);
      $f->end($f->end+$lastStop);
      $finalSeq->add_SeqFeature($f);
    }
    $finalSeq->seq($finalSeq->seq.$seq->seq.$$settings{linker});
    # add the linker - the "unsure" tag was the most appropriate one
    $finalSeq->add_SeqFeature(Bio::SeqFeature::Generic->new(
      -start=>$finalSeq->length-length($$settings{linker}),
      -end=>$finalSeq->length,
      -primary=>'intron',
      -tag=>{
        locus_tag=>"ENDOFCONTIG_".$seq->id,
        note=>"lengthOfContig=".$seq->length,
      }
    ));
    $lastStop+=$seq->length;
  }
  return $finalSeq;
}
 
sub usage{
  "Usage: perl $0 -i inputGenomicFile -s start -e end
  -i input file: the file extension dictates the format
  -o outfile
  -n name of organism (useful for genome browsers)
  -l linker (default: NNNNNNNNNN)
  ";
}
Personal tools
Namespaces
Variants
Actions
Main Links
documentation
community
development
Toolbox