Ben Murrell / May 10 2019
Remix of Julia by Nextjournal

Pairwise test.

import Pkg; Pkg.add("Distributions")
using Distributions, StatsBase
0.5s
#Hamming distance
hamming(s1,s2) = sum(collect(s1) .!= collect(s2));

#Compute H
function rake_H(seqs)
  numseqs = length(seqs)
  H = Int[]
  for i in 1:numseqs
    for j in i+1:numseqs
      push!(H,hamming(seqs[i],seqs[j]))
    end
  end
  return H
end

#Compute the p value from H (S choose 2 version)
function rake_test_from_H(H)
  pi_s = mean(H)
  p_0 = exp(-pi_s)
  p_val_vec = 1 .- cdf.(Binomial(length(H),p_0),0:length(H))
  num_zero_dists = sum(H .== 0)
  if num_zero_dists > 0
  	return p_val_vec[num_zero_dists+1]
  else
    return 1.0
  end
end

#Compute the p value from seqs
function rake_test(seqs)
  numseqs = length(seqs)
  H = rake_H(seqs)
  return rake_test_from_H(H)
end
rake_test (generic function with 1 method)

1. Computing H from sequences simulated under a star phylogeny.

#Seq simulation functions
alp = ['A','C','G','T']
evolve_seq(seq,p) = join([ifelse(rand()<p*4/3,rand(alp),s) for s in seq])

#Running test replicates
numseqs = 200
seqlength = 100
reps = 10000
null_replicate_p_vals = Float64[]
for i in 1:reps
  parent_seq = join(rand(alp,seqlength))
  seqs = [evolve_seq(parent_seq,0.025) for i in 1:numseqs];
  push!(null_replicate_p_vals,rake_test(seqs))
end
println("False positives: $(sum(null_replicate_p_vals .< 0.05)/reps)")
#Distribution of p-values (should be uniform)
histogram(null_replicate_p_vals,bins = 0:0.025:1.0)

2. Directly generating H with pairwise Poisson values.

#Checking the test implementation by directly matching the pairwise assumptions
numseqs = 200
seqlength = 100
reps = 10000
null_replicate_p_vals = Float64[]
for i in 1:reps
  S = numseqs
  H = rand(Poisson(0.025*2*seqlength),Int(S*(S-1)/2))
  rake_test_from_H(H)
  push!(null_replicate_p_vals,rake_test_from_H(H))
end
println("False positives: $(sum(null_replicate_p_vals .< 0.05)/reps)")
histogram(null_replicate_p_vals,bins = 0:0.025:1.0)