# R Function to generate empirical estimates of substitution probabilities in a matrix # of sequences generated with the Jukes-Cantor molec evol model. # # Author: Jeff Krause ############################################################################### # This function takes as an input a matrix of nucleotide sequences, as generated by the # "seq_evol" function, and tallies the single step substitution frequencies to come up with # substitution probabilities, to confirm that the "seq_evol" function is working as intended. nuc_subs <- function(x) { seqMatrix <- x numGen <- log2(dim(seqMatrix)[1]) # assign the number of generations seqLen <- dim(seqMatrix)[2] # assign the sequence length nuc <- c("A", "G", "C", "T") # vector of nucleotides freqMat <<- matrix(0,4,4) # frequency table for parent/daughter char pairs colnames(freqMat) <<- nuc rownames(freqMat) <<- nuc # Now loop through the matrix of sequences looking at each character position for all pairs of # parent/daughter sequence and tally the number of occurences of all 16 possible outcomes of # the nucleotide present in the parent sequence and the nucleotide present in the daughter # sequence # Start with the last row. The parent sequence of this sequence is at row 2^numGen. # Move to the next sequence for (i in numGen:1) { # loop through each generation for (j in (2^i):(2^(i-1)+1)) { # loop through each sequence in a generation genOffset = 2^i-j # Find the current offset from 2^i for (k in 1:seqLen) { # loop through each sequence position colIndex <- which(nuc == seqMatrix[2^(i-1)-genOffset,k]) # which nuc in parent seq rowIndex <- which(nuc == seqMatrix[j,k]) # which nuc in daughter seq freqMat[rowIndex,colIndex] <<- freqMat[rowIndex,colIndex] + 1 # increment freq table } } } return(freqMat) } nuc_comp <- function(x) { nucSeq <- x nuc <- c("A", "G", "C", "T") # vector of nucleotides freq <- c(0, 0, 0, 0) names(freq) <- nuc if (length(dim(nucSeq)) == 0) { for (i in 1:length(nucSeq)) { j <- which(nuc == nucSeq[i]) freq[j] = freq[j] + 1 } } else { for (k in 1:dim(nucSeq)[1]) { for (i in 1:dim(nucSeq)[2]) { j <- which(nuc == nucSeq[k,i]) freq[j] = freq[j] + 1 } } } prob <- freq/sum(freq) return(prob) }