scFv FAD analysis

#=
#Installing packages - this has already been done in this runtime, so does not have to be repeated.
using Pkg
Pkg.add(PackageSpec(name="NextGenSeqUtils", rev="1.0", url = "https://github.com/MurrellGroup/NextGenSeqUtils.jl.git"))
Pkg.add.(["PyPlot", "StatsBase", "Compose", "GraphPlot", "LightGraphs", "Colors"])
=#

#Loading packages
using NextGenSeqUtils,PyPlot,GraphPlot,LightGraphs,Colors,Compose

22.2s

scFV_NGSutils_runtime (Julia)

all_pan0.fastq

all_pan4.fastq

length_vs_qual(all_pan4.fastq
,alpha = 0.02)

12.9s

scFV_NGSutils_runtime (Julia)

#Filter and export filtered data
fastq_filter(all_pan4.fastq
, "all_pan4_filt.fastq",error_rate = 0.01, min_length = 700, max_length = 1300);
seqs,phreds,seq_names = read_fastq("all_pan4_filt.fastq");

#Use the demux_dict() function to orient the sequences, and discard those whose primers do not match what is expected
fwd_primers = ["GAGGAGGAGGAGGAGGAGGCGGGGC"]
rev_primers = ["GAGGAGGAGGAGGAGGAGCCTGGCC"]
demux_dic = demux_dict(seqs,fwd_primers,rev_primers,verbose=true,phreds = phreds,tol_one_error = true);

#Trim off the full primer sequences (absolutely essential when primers contain degenerate/ambig bases)
trimmed = [double_primer_trim(j[1],j[2],"GAGGAGGAGGAGGAGGAGGCGGGGCCCAGGCGGCCGAGCTC","GAGGAGGAGGAGGAGGAGCCTGGCCGGCCTGGCCACTAGT") for j in demux_dic[(1,1)]]
reads,phredarr = [s[1] for s in trimmed],[s[2] for s in trimmed]

#Run the Fast Amplicon Denoising algorithm
@time centroids,sizes = FAD(reads,phreds = phredarr);

#Sort sequences by size
size_sortperm = reverse(sortperm(sizes))
centroids = centroids[size_sortperm]
sizes = sizes[size_sortperm]

#Export size-sorted FAD reconstructed variants
write_fasta("Pan4_FAD.fasta",centroids,names = ["seq_$(j)_$(sizes[j])" for j in 1:length(sizes)])

46.5s

scFV_NGSutils_runtime (Julia)

#Look at distribution of stop codons
AA_seqs = [translate_to_aa(s[1:end-mod(end,3)]) for s in centroids];
stop_filter = [occursin("*",AA) for AA in AA_seqs];
fig = figure(figsize=(10,3))
plot([1:length(sizes);][.!stop_filter],(sizes[.!stop_filter]),color="blue")
plot([1:length(sizes);][stop_filter],(sizes[stop_filter]),".",color="red")
xlabel("FAD variant index")
ylabel("FAD variant size")
fig

1.8s

scFV_NGSutils_runtime (Julia)

#Running identical filtering for Pan0, but no FAD reconstruction, as templates just aren't repeatedly sampled in this massively diverse dataset
fastq_filter(all_pan0.fastq
, "all_pan0_filt.fastq",error_rate = 0.01, min_length = 700, max_length = 1300);
seqs,phreds,seq_names = read_fastq("all_pan0_filt.fastq");
demux_dic = demux_dict(seqs,fwd_primers,rev_primers,verbose=true,phreds = phreds,tol_one_error = true);
trimmed = [double_primer_trim(j[1],j[2],"GAGGAGGAGGAGGAGGAGGCGGGGCCCAGGCGGCCGAGCTC","GAGGAGGAGGAGGAGGAGCCTGGCCGGCCTGGCCACTAGT") for j in demux_dic[(1,1)]]
pan0reads,pan0phredarr = [s[1] for s in trimmed],[s[2] for s in trimmed]
write_fastq("all_pan0_trimmed.fastq",pan0reads,pan0phredarr)

14.7s

scFV_NGSutils_runtime (Julia)

#Computing kmer-approximate distances for all Pan4 variants vs all Pan0 reads.
pan4kmers = kmer_count.(centroids,6);
pan0kmers = kmer_count.(pan0reads,6);
min_dists = [minimum([corrected_kmer_dist(pan0,pan4) for pan0 in pan0kmers]) for pan4 in pan4kmers];
cross_minimum = minimum(min_dists)

34.5s

scFV_NGSutils_runtime (Julia)

0.0235507

#Plotting the smallest distance between each Pan4 variant and all reads in Pan0, showing that there are none that are close enough to plausibly originate from the same template
fig = figure(figsize=(10,3))
plot(min_dists,".",color = "blue")
xlabel("Pan4 Variant (largest to smallest)")
ylabel("kmer distance to nearest Pan0")
fig

0.9s

scFV_NGSutils_runtime (Julia)

#Computing neighbour graph, using lowest Pan4-to-Pan0 distance as graph threshold
dist_mat = [corrected_kmer_dist(p1,p2) for p1 in pan4kmers, p2 in pan4kmers];
graph_mat = dist_mat .< cross_minimum
for i in 1:length(sizes)
    graph_mat[i,i] = false
end
neighbour_graph = Graph(graph_mat);

1.8s

scFV_NGSutils_runtime (Julia)

#Approximating long vs short linker, used to color nodes
long_linker = 0.99 .* [occursin("TCCTCTGGTGGCGGTGGCTCGGGCGGTGGTGGG",i) for i in centroids];

gplot(neighbour_graph,
    nodefillc=[RGBA(i,0.0,1-i,0.5) for i in long_linker],
    nodesize = sqrt.(sizes).+0.5,
    NODESIZE = 0.05)

14.2s

scFV_NGSutils_runtime (Julia)

#Computing graph components
components_arr = connected_components(neighbour_graph);

#Drawing graph of all components above size 5
component_of_interest_inds = sort(vcat(components_arr[length.(components_arr).>5]...));
component_graph = Graph(graph_mat[component_of_interest_inds,component_of_interest_inds]);
gplot(component_graph,
    nodefillc=[RGBA(i,0.0,1-i,0.8) for i in long_linker[component_of_interest_inds]],
    nodesize = sqrt.(sizes[component_of_interest_inds]).+0.5,
    NODESIZE = 0.05)

1.9s

scFV_NGSutils_runtime (Julia)