# R Function to generate empirical estimates of substitution probabilities in a matrix
# of sequences generated with the Jukes-Cantor molec evol model. 
#
# Author: Jeff Krause
###############################################################################

# This function takes as an input a matrix of nucleotide sequences, as generated by the 
# "seq_evol" function, and tallies the single step substitution frequencies to come up with 
# substitution probabilities, to confirm that the "seq_evol" function is working as intended.

nuc_subs <- function(x) {

	seqMatrix <- x
	numGen <- log2(dim(seqMatrix)[1])	# assign the number of generations
	seqLen <- dim(seqMatrix)[2]			# assign the sequence length
	nuc <- c("A", "G", "C", "T")		# vector of nucleotides
	freqMat <<- matrix(0,4,4) 	# frequency table for parent/daughter char pairs
	colnames(freqMat) <<- nuc
	rownames(freqMat) <<- nuc
	
	# Now loop through the matrix of sequences looking at each character position for all pairs of 
	# parent/daughter sequence and tally the number of occurences of all 16 possible outcomes of 
	# the nucleotide present in the parent sequence and the nucleotide present in the daughter 
	# sequence
	
	# Start with the last row. The parent sequence of this sequence is at row 2^numGen. 
	# Move to the next sequence

	for (i in numGen:1) {				# loop through each generation
		for (j in (2^i):(2^(i-1)+1)) {		# loop through each sequence in a generation
			genOffset = 2^i-j			# Find the current offset from 2^i
			for (k in 1:seqLen) {		# loop through each sequence position
				colIndex <- which(nuc == seqMatrix[2^(i-1)-genOffset,k])  # which nuc in parent seq
				rowIndex <- which(nuc == seqMatrix[j,k])  				# which nuc in daughter seq
				freqMat[rowIndex,colIndex] <<- freqMat[rowIndex,colIndex] + 1	# increment freq table
			}
		}
	}
	return(freqMat)
}

nuc_comp <- function(x) {
	nucSeq <- x
	nuc <- c("A", "G", "C", "T")		# vector of nucleotides
	freq <- c(0, 0, 0, 0)
	names(freq) <- nuc
	if (length(dim(nucSeq)) == 0) {
		for (i in 1:length(nucSeq)) {
			j <- which(nuc == nucSeq[i])
			freq[j] = freq[j] + 1
		}
	}
	else {
		for (k in 1:dim(nucSeq)[1]) {
			for (i in 1:dim(nucSeq)[2]) {
				j <- which(nuc == nucSeq[k,i])
				freq[j] = freq[j] + 1
			}
		}
	}
	prob <- freq/sum(freq)
	return(prob)
}