scFv FAD analysis
#= #Installing packages - this has already been done in this runtime, so does not have to be repeated. using Pkg Pkg.add(PackageSpec(name="NextGenSeqUtils", rev="1.0", url = "https://github.com/MurrellGroup/NextGenSeqUtils.jl.git")) Pkg.add.(["PyPlot", "StatsBase", "Compose", "GraphPlot", "LightGraphs", "Colors"]) =# #Loading packages using NextGenSeqUtils,PyPlot,GraphPlot,LightGraphs,Colors,Compose
length_vs_qual(all_pan4.fastq,alpha = 0.02)
#Filter and export filtered data fastq_filter(all_pan4.fastq, "all_pan4_filt.fastq",error_rate = 0.01, min_length = 700, max_length = 1300); seqs,phreds,seq_names = read_fastq("all_pan4_filt.fastq"); #Use the demux_dict() function to orient the sequences, and discard those whose primers do not match what is expected fwd_primers = ["GAGGAGGAGGAGGAGGAGGCGGGGC"] rev_primers = ["GAGGAGGAGGAGGAGGAGCCTGGCC"] demux_dic = demux_dict(seqs,fwd_primers,rev_primers,verbose=true,phreds = phreds,tol_one_error = true); #Trim off the full primer sequences (absolutely essential when primers contain degenerate/ambig bases) trimmed = [double_primer_trim(j[1],j[2],"GAGGAGGAGGAGGAGGAGGCGGGGCCCAGGCGGCCGAGCTC","GAGGAGGAGGAGGAGGAGCCTGGCCGGCCTGGCCACTAGT") for j in demux_dic[(1,1)]] reads,phredarr = [s[1] for s in trimmed],[s[2] for s in trimmed] #Run the Fast Amplicon Denoising algorithm centroids,sizes = FAD(reads,phreds = phredarr); #Sort sequences by size size_sortperm = reverse(sortperm(sizes)) centroids = centroids[size_sortperm] sizes = sizes[size_sortperm] #Export size-sorted FAD reconstructed variants write_fasta("Pan4_FAD.fasta",centroids,names = ["seq_$(j)_$(sizes[j])" for j in 1:length(sizes)])
#Look at distribution of stop codons AA_seqs = [translate_to_aa(s[1:end-mod(end,3)]) for s in centroids]; stop_filter = [occursin("*",AA) for AA in AA_seqs]; fig = figure(figsize=(10,3)) plot([1:length(sizes);][.!stop_filter],(sizes[.!stop_filter]),color="blue") plot([1:length(sizes);][stop_filter],(sizes[stop_filter]),".",color="red") xlabel("FAD variant index") ylabel("FAD variant size") fig
#Running identical filtering for Pan0, but no FAD reconstruction, as templates just aren't repeatedly sampled in this massively diverse dataset fastq_filter(all_pan0.fastq, "all_pan0_filt.fastq",error_rate = 0.01, min_length = 700, max_length = 1300); seqs,phreds,seq_names = read_fastq("all_pan0_filt.fastq"); demux_dic = demux_dict(seqs,fwd_primers,rev_primers,verbose=true,phreds = phreds,tol_one_error = true); trimmed = [double_primer_trim(j[1],j[2],"GAGGAGGAGGAGGAGGAGGCGGGGCCCAGGCGGCCGAGCTC","GAGGAGGAGGAGGAGGAGCCTGGCCGGCCTGGCCACTAGT") for j in demux_dic[(1,1)]] pan0reads,pan0phredarr = [s[1] for s in trimmed],[s[2] for s in trimmed] write_fastq("all_pan0_trimmed.fastq",pan0reads,pan0phredarr)
#Computing kmer-approximate distances for all Pan4 variants vs all Pan0 reads. pan4kmers = kmer_count.(centroids,6); pan0kmers = kmer_count.(pan0reads,6); min_dists = [minimum([corrected_kmer_dist(pan0,pan4) for pan0 in pan0kmers]) for pan4 in pan4kmers]; cross_minimum = minimum(min_dists)
0.0235507
#Plotting the smallest distance between each Pan4 variant and all reads in Pan0, showing that there are none that are close enough to plausibly originate from the same template fig = figure(figsize=(10,3)) plot(min_dists,".",color = "blue") xlabel("Pan4 Variant (largest to smallest)") ylabel("kmer distance to nearest Pan0") fig
#Computing neighbour graph, using lowest Pan4-to-Pan0 distance as graph threshold dist_mat = [corrected_kmer_dist(p1,p2) for p1 in pan4kmers, p2 in pan4kmers]; graph_mat = dist_mat .< cross_minimum for i in 1:length(sizes) graph_mat[i,i] = false end neighbour_graph = Graph(graph_mat);
#Approximating long vs short linker, used to color nodes long_linker = 0.99 .* [occursin("TCCTCTGGTGGCGGTGGCTCGGGCGGTGGTGGG",i) for i in centroids]; gplot(neighbour_graph, nodefillc=[RGBA(i,0.0,1-i,0.5) for i in long_linker], nodesize = sqrt.(sizes).+0.5, NODESIZE = 0.05)
#Computing graph components components_arr = connected_components(neighbour_graph); #Drawing graph of all components above size 5 component_of_interest_inds = sort(vcat(components_arr[length.(components_arr).>5]...)); component_graph = Graph(graph_mat[component_of_interest_inds,component_of_interest_inds]); gplot(component_graph, nodefillc=[RGBA(i,0.0,1-i,0.8) for i in long_linker[component_of_interest_inds]], nodesize = sqrt.(sizes[component_of_interest_inds]).+0.5, NODESIZE = 0.05)