Counting k-mers in large sets of large sequences
From BioPerl
(This is the fruit of two scraps, Getting all k-mer combinations of residues and Sharing large arrays among multiple threads)
Marco Blanchette contributes the following scripts. Script 1 (countKmer.pl) processes a (possibly very) large sequence file using multiple threads, to return the frequency distribution of k-mers of arbitrary length k. Script 2 (sampleAndCountKmer.pl) does the same thing, if desired, or will randomly sample the sequences (with or without replacement) to return an empirical frequency distribution of k-mers. The user can also specify whether to count just presence/absence of a k-mer in a sequence, or to return a complete frequency distribution.
Script 1 (countKmer.pl)
#!/usr/local/bin/perl # countKmer.pl take a series of fasta files and count all the occurrence of all the # possible kmers in each files. It's a multi-threaded scripts that treat multiple files # asynchronously use strict; use warnings; use Getopt::Std; use threads; use Thread::Queue; our ($opt_k,$opt_m,$opt_p); our $SEP="\t"; MAIN:{ init(); my $q = Thread::Queue->new; $q->enqueue(@ARGV); my $num_workers = @ARGV < $opt_p ? @ARGV : $opt_p; # no need to be wasteful :) for (1 .. $num_workers) { threads->new(\&worker, $q); } $_->join for threads->list; } sub worker { my $queue = shift; while (my $filename = $queue->dequeue_nb) { processFile($filename); } return(0); } sub processFile { my $file = shift; #generate all possible word for a k-mer (define at run time by $opt_k) print STDERR "Generating all the posible sequences $opt_k long\n"; my $kmer_p = kmer_generator($opt_k); print STDERR "Counting the number of $opt_k nt long kmers in $file\n"; loadSeq($file,$kmer_p); ##print out the hits printHits($kmer_p,$file); } sub loadSeq { my $file = shift; my $kmer_p = shift; open FH, "$file" || die "Can't open file $file\n"; my $f=0; my $seq; while(<FH>){ if(/^>/){ countKmer(\$seq,$kmer_p) if $seq; $seq=''; $f=1; }elsif($f==1){ chomp; next if ""; $seq = join("",$seq,$_); } } close FH; return(0); } sub countKmer { my $seq_p = shift; my $kmer_p = shift; my $k = $opt_k; my %beenThere; for (my $i=0;$i <= length(${$seq_p})-$k;$i++){ my $w = substr(${$seq_p},$i,$k); unless ($opt_m){ #Count only one occurrence of a kmer per sequence $kmer_p->{$w}++ if !exists $beenThere{$w} && exists $kmer_p->{$w}; $beenThere{$w}=1; }else{ #Count all instances of a kmer per sequence $kmer_p->{$w}++ if exists $kmer_p->{$w}; } } return(0); } sub printHits { my $kmer_p=shift; my $file = shift; ##print out the hits my ($dir,$pre,$suf) = ($file =~ /(^.+\/|^)(.+)\.(.+$)/); open OUT, ">$pre.hits" || die "Can't create file $pre.hits\n"; print OUT join($SEP, $_, $kmer_p->{$_}), "\n" for keys %{$kmer_p}; close OUT; return(0); } sub kmer_generator { my $k = shift; my $kmer_p; my @bases = ('A','C','G','T'); my @words = @bases; for (my $i=1;$i<$k;$i++){ my @newwords; foreach my $w (@words){ foreach my $b (@bases){ push (@newwords,$w.$b); } } undef @words; @words = @newwords; } map{ $kmer_p->{$_}=0} @words; return($kmer_p); } sub init { getopts("p:k:m"); unless (@ARGV){ print("\nUsage: countKmer.pl [-k 6 -p 4 -m] sequence_1.fa [sequence_2.fa ...]\n", "\tFor each possible words in the kmer of lenght -k,\n", "\tcount the number of time they are found in the fasta sequence file\n", "\t-k\tsize of the kmer to analyze. Default 6\n", "\t-m\twill count all possible kmer per sequences.\n", "\t\tDefault: only one kmer is counted per sequence entries\n", "\t-p\tThe number of jobs to process simultaneoulsy. Normally, the number of available processors\n", "\t\tDefault: 4\n", "\n", ); exit(1) } $opt_k=6 unless $opt_k; $opt_p=4 unless $opt_p; return(0); }
Script 2 (sampleAndCountKmer.pl)
#!/usr/local/bin/perl # sampleAndCountKmer.pl is a bootstrapping procedure that sample a list of sequences a # certain number of time and return the count of each kmer in each sampling. Again this # is a mutli-threaded scripts that will count multiple samples asynchronously use strict; use warnings; use Getopt::Std; use threads; use threads::shared; use Thread::Queue; our ($opt_f,$opt_k,$opt_m,$opt_p,$opt_s,$opt_n,$opt_w,$opt_v); our $SEP="\t"; MAIN:{ init(); my $file = $opt_f; my $seq_p = loadSeq($opt_f); #create the final result hash with shared arrays at each kmers. my $res = &share({}); $res = kmer_generator($opt_k); $res->{$_} = &share([]) for keys %{$res}; my $q = Thread::Queue->new; $q->enqueue(1..$opt_n); my $num_workers = $opt_p; # no need to be wasteful :) for (1 .. $num_workers) { threads->new(\&worker, $q, $seq_p, $res, $_); } for (threads->list){ $_->join; } ##Now, need to deconvolute the arrays containing all the counts for all kmers ##and spit out a table of count for all the samples print(join ("$SEP",$_,@{$res->{$_}}),"\n") for keys %$res; print STDERR "all done!\n" if $opt_v; } sub worker { my $queue = shift; my $seq_p = shift; my $res = shift; my $t_id = shift; while (my $q_id = $queue->dequeue_nb) { processSeq($q_id,$seq_p,$res); } return(0); } sub processSeq { my $q_id = shift; my $seq_p = shift; my $res = shift; #generate all possible word for a k-mer (define at run time by $opt_k) print STDERR "Generating all the posible sequences $opt_k nucleotid long kmer for sample $q_id\n" if $opt_v; my $kmer_p = kmer_generator($opt_k); #randomly sample the sequences my $sample = sampleSeq($seq_p,$q_id); #counting the number of kmers in the sample seq print STDERR "Counting the number of $opt_k nt long kmers in the $q_id sample\n" if $opt_v; map{ countKmer($_,$kmer_p) } @{$sample}; #put the hits in the final result kmer; map { push @{$res->{$_}}, $kmer_p->{$_} } keys %{$res}; return(0); } sub sampleSeq { my $seq_p = shift; my $q_id = shift; my $sample_size = $opt_s; #create a list of $sample_size items of randomly selected position in @seqs my @lines; unless ($opt_w){ ##Sampling with replacement while (scalar(@lines) < $sample_size){ push @lines, int(rand(@{$seq_p})); } }else{ ##Sampling without replacement my %lines; while (scalar(keys %lines) < $sample_size){ $lines{int(rand(@{$seq_p}))}=''; } @lines = keys %lines; } print STDERR "Sampling $sample_size items for the sample $q_id\n" if $opt_v; my @sample = map { $seq_p->[$_] } @lines; return(\@sample); } sub loadSeq { my $file = shift; my $seq_p; open FH, "$file" || die "Can't open file $file\n"; my $f=0; my $seq; while(<FH>){ if(/^>/){ push @{$seq_p}, $seq if $seq; $seq=''; $f=1; }elsif($f==1){ chomp; next if ""; $seq = join("",$seq,$_); } } close FH; return($seq_p); } sub countKmer { my $seq = shift; my $kmer_p = shift; my $k = $opt_k; my %beenThere; for (my $i=0;$i <= length($seq)-$k;$i++){ my $w = substr($seq,$i,$k); unless ($opt_m){ #Count only one occurrence of the kmer per sequence $kmer_p->{$w}++ if !exists $beenThere{$w} && exists $kmer_p->{$w}; $beenThere{$w}=1; }else{ #Count all instances of each kmers in a sequence $kmer_p->{$w}++ if exists $kmer_p->{$w}; } } return(0); } sub printHits { my $kmer_p=shift; my $file = shift; ##print out the hits my ($dir,$pre,$suf) = ($file =~ /(^.+\/|^)(.+)\.(.+$)/); open OUT, ">$pre.hits" || die "Can't create file $pre.hits\n"; print OUT join($SEP, $_, $kmer_p->{$_}), "\n" for keys %{$kmer_p}; close OUT; return(0); } sub kmer_generator { my $k = shift; my $q_id=shift; my $kmer_p; my @bases = ('A','C','G','T'); my @words = @bases; for (my $i=1;$i<$k;$i++){ my @newwords; foreach my $w (@words){ foreach my $b (@bases){ push (@newwords,$w.$b); } } undef @words; @words = @newwords; } map{ $kmer_p->{$_}=0} @words; return($kmer_p); } sub init { getopts("f:p:k:s:n:mwv"); unless ($opt_f){ print("\nUsage:\tsampleAndCountKmer.pl [-k 6 -p 4 -s 100000 -n 1000 -m -w] -f sequence.fa \n", "\twill return a table of counts for each k-mers of length -k\n", "\t-f\tFasta database. Required\n", "\t-k\tsize of the k-mer to analyze. Default 6\n", "\t-m\twill count all possible k-mer per sequence.\n", "\t\tDefault: only one k-mer is counted per sequence entries\n", "\t-s number of sequences to sample. Default: 100,000 sequences\n", "\t-n number of times to repeat the sampling/counting procedures. Default: 1000 times\n", "\t-w Sample without replacment: Default: with replacement\n", "\t-p\tThe number of jobs to process simultaneoulsy. Normally, the number of available processors\n", "\t\tDefault: 4\n", "\t-v Run in verbrose mode. Spits out a lot of infomation. Default: quiet\n", "\n", ); exit(1) } $opt_k=6 unless $opt_k; $opt_p=4 unless $opt_p; $opt_s=100000 unless $opt_s; $opt_n=1000 unless $opt_n; return(0); }