# R Function to carry out molecular evolution according to the Jukes-Cantor (JC69) model. The # returned matrix of sequence values comprise a binary tree corresponding to multiple rounds of # DNA replication and cell division. The first sequence is generated randoml assuming equal # abundance of nucleotides. This first sequence then serves as a template for the second, sequences # 1 and 2 then serve as templates for sequences 3 and 4, respectively. Sequences 1-4 serve as # templates for sequences 5-8, and so on. # # Author: Jeff Krause ############################################################################### library(sfsmisc) # package needed for "digitBase" function used to generate binary count of sequences # The function requires the length of the sequence (which will be randomly generated), # the number of generations to be simulated, and the substitution rate parameter. seq_evol <- function(seqLen, numGen, subRate) { # Allocate a matrix to store all of the sequences resulting from the specified number # of generations, and place a random starting sequence in first row. # use scan(what="") to get user input seq seqMatrix <- matrix(nrow=2^numGen, ncol=(seqLen)) for (i in 1:seqLen) seqMatrix[1,i] <- switch(ceiling(4*runif(1)), "A", "C", "G", "T") # Generate a vector of bit sums that represent the distance of a sequence in the resulting # matrix from the original starting sequence. The function "digitsBase" from {sfsmisc} # produce a matrix of digits counting the sequences in binary. The sum of the binary digits # for each sequence number provides the distance in terms of the number of replication events # between the starting sequence and a sequence inthe matrix. The vector of bit sums is placed # in the final column of the matrix of sequences. rownames(seqMatrix) <- paste(seq(1:2^numGen), apply(digitsBase(0:(2^numGen-1)),2,sum), sep="-d") # colnames(seqMatrix) <- seq(1:seqLen) # Loop through generations carrying out replication with probability of substitution # calcaulted from substitution rate provided nuc <- c("A", "G", "C", "T") for (i in 1:numGen) { # outer loop cycles through each generation for (j in 1:2^(i-1)) { # next loop cycles through sequences being generated for (k in 1:seqLen) { # cycle through each sequence position # If a random variable is less than the probabilty for keeping the nucleotide in the # parent sequence (based on JC69), then keep this nucleotide in the daughter sequence. if (runif(1) < 0.25 + (0.75*exp(-4*subRate))) seqMatrix[2^i-2^(i-1)+j, k] <- seqMatrix[j, k] # Otherwise, find out what was in this position in the parent sequence and randomly # choose one of the other three nucleotides. else { omit <- which(nuc == seqMatrix[j,k]) nucSet <- nuc[-omit] seqMatrix[2^i-2^(i-1)+j, k] <- switch(ceiling(3*runif(1)), nucSet[1], nucSet[2], nucSet[3]) } } } } seqMatrix }