#!/usr/bin/perl #exp_list_mc -Calculates the probability a gene set shows average expression #response different from a random group of genes sampled from the input dataset. # #Takes as input the file of expression values for all the genes and a #directory of gene lists. Finds the probability of the ave. expression #value of the group by Monto Carlo sampling the total gene set to find the #distrib. of expression values for a gene set the same size as the gene set #being tested. Computes ave. exp value probability statistic for each gene #list. # #Expression value file is expected to be tab delimited and have the gene name #in the 1st field, expression value in the 2nd field. # #The gene files are expected to have the a gene name in the 1st field on each #line. # #exp_list_mc -d{lists_dir} [ -sSamples ] < exp_data_file > output_by_list.txt # #Example: #exp_list_mc -d./gene_lists -s1000 < Expr_tab_delimited.txt > MC_results.txt # #--------------------------- #This program is free software; you can redistribute it and/or #modify it under the terms of the GNU General Public License #as published by the Free Software Foundation; either version 2 #of the License, or (at your option) any later version. # #This program is distributed in the hope that it will be useful, #but WITHOUT ANY WARRANTY; without even the implied warranty of #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #GNU General Public License for more details. # #You should have received a copy of the GNU General Public License #along with this program; if not, write to the Free Software #Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #--------------------------- # #Version: 0.010 # #Last modified: 9/05 #Written by Jim Lund, University of Kentucky, Department of Biology #jiml@uky.edu # #Web site: http://elegans.uky.edu #--------------------------- use Statistics::Descriptive; use Statistics::Distributions; # #Subroutines # sub LOAD_LISTS; sub RAND_SEED; sub SAMPLE; sub SAMPLE_RANK; sub NORMAL_PVAL; # #Global variables. # $times_to_sample=10000; # #Check for command line args. # for $i (@ARGV) { $switch=substr($i,0,2); # #First check arguments for 'cry for help' # if ($switch eq "-q") { print ; exit(0); } # #Directory of gene lists. # if ($switch eq "-d") {$list_dir=substr($i,2);} # #Directory of gene lists. # if ($switch eq "-s") {$times_to_sample=substr($i,2);} # #Interpret remaining arguments # if (substr($i,0,1) ne "-") {$output_file=$i;} } ($output_path)= $output_file=~/^(.*\/)/; # #If no list dir is given print useage info and exit. # if (!$list_dir) { print ; exit; } # #Read in all the expression data by gene. # #May need to modify for your input file. Expects tab-delimited text file, #gene name in first field, expr value in 3rd field. # my %expr; while () { chomp; split(/\t/,$_); my $gene=uc($_[0]); $expr{$gene}=$_[1]; } # #Get count and array of all the genes. # $total_genes=keys %expr; @all_genes=keys %expr; #print "Total genes: $total_genes\n"; # #Load all the gene lists. # &LOAD_LISTS($list_dir); # #Now calculate summary statistics for each gene group. # my (@median,@average,@count); for ($i=1;$i <= $#lists;$i++) { undef $total; undef $average; undef @values; undef @values_ordered; foreach $gene (keys %{$lists[$i]}) { if (exists $expr{$gene}) { $count[$i]++; $total += $expr{$gene}; $values[$count[$i]] = $expr{$gene}; } } my @values_ordered = sort {$values[$a] <=> $values[$b]} @values; $median[$i] = $values_ordered[int($#values_ordered/2)]; $median[$i] = sprintf("%.2f",$median[$i]); $average[$i] = sprintf("%.2f",($total/$count[$i])); } # #Seed the random number generator. # &RAND_SEED; # #Now pull out random sets the size of each list, and find the Pval for the #set. # print "list_title\tlist_genes\tgenes_with_data\taverage\tmedian\tpval\trank\n"; for ($i=1;$i <= $#lists;$i++) { undef @sample; my $list_size=keys %{$lists[$i]}; # #Adjust $times_to_sample down for very large lists. # if ($list_size > 500) { $this_samples = 10000; } else { $this_samples = $times_to_sample; } &SAMPLE($list_size,$this_samples); my ($rank,$pval) = &SAMPLE_RANK($average[$i]); my $bad_name_total = keys %{$bad_name[$i]}; if ($bad_name_total) { print "$list_title[$i], $list_size list genes, $bad_name_total gene names not recognized\n"; print STDERR "$list_title[$i], $list_size list genes, $bad_name_total gene names not recognized\n"; } print "$list_title[$i]\t$list_size\t$count[$i]\t$average[$i]\t$median[$i]\t$pval\trank is $rank of $this_samples\n\n"; } exit; # #End of main section. # # #Subroutines # ############################################################################ #Called by: main #Calls: none # # #Seed the random number generator. # # sub RAND_SEED { srand(time ^ $$ ^ unpack "%32L*", `ps axww | gzip`); } ############################################################################ #Called by: main #Calls: none # #Pull out the requested number of samples from the full list of genes. Sample #without replacement. Calc. the average expression value for each sample. # sub SAMPLE_RANK { my ($target) = @_; my($i,$min,$min_count,$max,$max_count,$count,$value); my($rank,$pval); #foreach $value (sort {$a <=> $b} @sample_ave) {print STDERR "$value\n";} foreach $value (sort {$a <=> $b} @sample_ave) { $count++; #DEBUG #$i++;print STDERR "$i, $value, $target\n"; if (($count == 1) && ($target < $value)) { $min_count=1; $max_count=1; last; } if ($target >= $value) { if ($min != $value) { $min=$value; $min_count=$count; } } if ($min && ($target <= $value)) { $max=$value; $max_count=$count; } if ($min && $max) {last;} #DEBUG #if ($last_val == $value) {print STDERR "$value\n";} #$last_val=$value; #print STDERR "Median value is $value\n" if ($count == int($times_to_sample/2)); #END-DEBUG } if (!$max_count) {$max_count=@sample_ave;} $rank=int(($min_count+$max_count)/2); if ($rank <= (@sample_ave/2)) {$rank++;} $pval=$rank/@sample_ave; if ($pval > .5) {$pval=1-$pval;} $pval*=2; $pval=sprintf("%1.4f",$pval); if ($rank == $this_samples) { $pval = &NORMAL_PVAL(\@sample_ave,$target); } #DEBUG #print STDERR "min,max,pval,rank,min_count,max_count:$min,$max,$pval,$rank,$min_count,$max_count\n"; return($rank,$pval); } ############################################################################ #Called by: main #Calls: none # #Pull out the requested number of samples from the full list of genes. Sample #without replacement. Calc. the average slope for each sample. # sub SAMPLE { my ($list_size,$times_to_sample)=@_; my ($i,$rand,$sample_size,%sample); undef @sample_ave; for ($i=0;$i < $times_to_sample;$i++) { undef %sample; $sample_size=0; while ($sample_size < $list_size) { $rand=rand($total_genes-1); if (($rand - int($rand)) < .5) {$rand=int($rand);} else {$rand=int($rand)+1;} $gene=$all_genes[$rand]; if (exists $sample{$gene}) {redo;} else { $sample{$gene}=1; $sample_size++; } } $sample_ave[$i]=&AVE(\%sample); #DEBUG #print STDERR "$i, $sample_ave[$i]\n"; } } ############################################################################ #Called by: main #Calls: none # #Average the genes in the hash passed in. # sub AVE { my ($sample_ref)=@_; my ($gene,$ave); my $total=0; my $count=0; foreach $gene (keys %$sample_ref) { $total+=$expr{$gene}; $count++; } $ave=$total/$count; return($ave); } ############################################################################ #Called by: main #Calls: none # #Get the list of genes from the form. # sub LOAD_LISTS { my ($load_dir)=@_; @files=&GET_DIR($list_dir); $list_dir=~s:/*$:/:; my ($line,$file,$gene); my $list_number=0; foreach $file (@files) { open(LIST,'<'.$list_dir.$file) or warn "Can't open file $file: $!\n"; $list_number++; $list_title[$list_number]=$file; while ($line=) { ($gene)= $line=~/^\s*(\S+)/; $gene=uc($gene); if ($gene) { if (!exists $expr{$gene}) { $bad_name[$list_number]{$gene}=1; next; } $lists[$list_number]{$gene}=1; } } close LIST or warn "Warning, error closing $file: $!\n"; } } ############################################################################ #Called by: &LOAD_LISTS #Calls: none # #Get the list of files from dir. # sub GET_DIR { my ($dir)=@_[0]; my ($file_path,$i,@files); (opendir(DIR,$dir)) || warn "Can't read files from directory $dir: $!"; @files= grep(!/^\./, readdir DIR); for ($i=0; $i <= $#files;$i++) { $file_path=$dir.'/'.$files[$i]; if ((-e $file_path) && (-z $file_path)) {splice(@files,$i,1);} if ((-e $file_path) && (!(-f $file_path))) {splice(@files,$i,1);} } closedir DIR || warn "Can't close directory $dir: $!"; return @files; } ############################################################################ #Called by: &SAMPLE_RANK #Calls: none # #Calculate a pval from a test value and a set of data assumed to by normally #distributed. # sub NORMAL_PVAL { my ($sample_ref,$value) = @_; #Create a new sparse statistics object. my $stat = Statistics::Descriptive::Sparse->new(); #Adds data to the statistics variable. $stat->add_data(@$sample_ref); #Returns the mean of the data. my $mean = $stat->mean(); #Returns the variance of the data. Division by n-1 is used. my $var = $stat->variance(); my $std_dev = $stat->standard_deviation(); # #Standardize value. # my $std_value = ($value - $mean) / $std_dev; my $uprob=Statistics::Distributions::uprob ($std_value); if ($uprob > .5) {$uprob = 1 - $uprob;} $uprob *= 2; $uprob = sprintf("%.3e",$uprob); #print STDERR "SD,mean val,new_val: $std_dev,$mean $value -> $std_value\n"; return $uprob; } # #End of program. __DATA__ exp_list_mc -Calculates the probability a gene set shows average expression response different from a random group of genes sampled from the input dataset. Takes as input the file of expression values for all the genes and a directory of gene lists. Finds the probability of the ave. expression distribution of expression values for a gene set the same size as the gene set being tested. Computes average expression value probability statistic for each gene list. Expression value file is expected to be tab delimited and have the gene name in the 1st field, expression value in the 2nd field. The gene files are expected to have the a gene name in the 1st field on each #line. exp_list_mc -d{lists_dir} [ -sSamples ] < exp_data_file > output_by_list.txt Example: exp_list_mc -d./gene_lists -s1000 < Expr_tab_delimited.txt > MC_results.txt Written by Jim Lund, University of Kentucky, Department of Biology Version: 0.010, jiml@uky.edu, http://elegans.uky.edu