# R Function to carry out molecular evolution according to the Jukes-Cantor (JC69) model. The 
# returned matrix of sequence values comprise a binary tree corresponding to multiple rounds of 
# DNA replication and cell division. The first sequence is generated randoml assuming equal 
# abundance of nucleotides. This first sequence then serves as a template for the second, sequences
# 1 and 2 then serve as templates for sequences 3 and 4, respectively. Sequences 1-4 serve as 
# templates for sequences 5-8, and so on.
#
# Author: Jeff Krause
###############################################################################

library(sfsmisc) # package needed for "digitBase" function used to generate binary count of sequences

# The function requires the length of the sequence (which will be randomly generated),
# the number of generations to be simulated, and the substitution rate parameter.

seq_evol <- function(seqLen, numGen, subRate) {

# Allocate a matrix to store all of the sequences resulting from the specified number 
# of generations, and place a random starting sequence in first row.
# use scan(what="") to get user input seq

	seqMatrix <- matrix(nrow=2^numGen, ncol=(seqLen))
	for (i in 1:seqLen) seqMatrix[1,i] <- switch(ceiling(4*runif(1)), "A", "C", "G", "T")
	
# Generate a vector of bit sums that represent the distance of a sequence in the resulting 
# matrix from the original starting sequence. The function "digitsBase" from {sfsmisc} 
# produce a matrix of digits counting the sequences in binary. The sum of the binary digits
# for each sequence number provides the distance in terms of the number of replication events  
# between the starting sequence and a sequence inthe matrix. The vector of bit sums is placed 
# in the final column of the matrix of sequences.
	rownames(seqMatrix) <- paste(seq(1:2^numGen), apply(digitsBase(0:(2^numGen-1)),2,sum), sep="-d")
	# colnames(seqMatrix) <- seq(1:seqLen)
	
# Loop through generations carrying out replication with probability of substitution
# calcaulted from substitution rate provided
    nuc <- c("A", "G", "C", "T")
	for (i in 1:numGen) { 				# outer loop cycles through each generation
		for (j in 1:2^(i-1)) { 			# next loop cycles through sequences being generated
			for (k in 1:seqLen) {  		# cycle through each sequence position
				# If a random variable is less than the probabilty for keeping the nucleotide in the 
				# parent sequence (based on JC69), then keep this nucleotide in the daughter sequence. 
				if (runif(1) < 0.25 + (0.75*exp(-4*subRate))) 
					seqMatrix[2^i-2^(i-1)+j, k] <- seqMatrix[j, k]
				# Otherwise, find out what was in this position in the parent sequence and randomly 
				# choose one of the other three nucleotides.
				else {
					omit <- which(nuc == seqMatrix[j,k]) 
					nucSet <- nuc[-omit]
					seqMatrix[2^i-2^(i-1)+j, k] <- switch(ceiling(3*runif(1)), nucSet[1], nucSet[2], nucSet[3])
				}
			}
		}
	}
	seqMatrix
}