# Remember that help and details on any function mentioned in this script can be # obtained by typing "help(function.name)". Help on functions in add-on packages # are only available when those packages are installed and loaded using the # "library" function as show below # This exercise demonstrates the use of the "seqinr" package in R for importing, # manipulating and analyzing biiological sequnce data. If you haven't installed # the seqinr package, you can use the Package Installer from the Packages & Data # menu, or you can use installed.packages() # lists the packages you have installed in R install.packages("seqinr", repos = "http://cran.r-project.org", dependencies = TRUE, clean = TRUE) # for Bioconductor packages it's a little different source("http://bioconductor.org/biocLite.R") biocLite("Biostrings") # If you would like to ensure that you have the latest release of the packages # that you have installed you can use the update packages command update.packages(repos = "http://cran.r-project.org", clean=TRUE) # Use the read.fasta command to read in FASTA format sequences, specifying # either amino acid or DNA, from the "seq_data" directory in the home # directory. The "seq_data" directory is # provided as a zip archive at te site this script was downloaded from. The # current working directory, which defaults to the home directory, can be # discovered and set with getwd and setwd commands. Extract the "seq_data" # directory and place it in the home directory in order to use these sequences # for the exercises. library(seqinr) library(biostrings) pdsc <- "FTLYGDGASNQGQVFESFNMAKLWNLPVVFCCENNKYGMGTAASRSSAMTEYFKRGQYIPGLKVNGMDIL" pdhs <- "AALWKLPCIFICENNRYGMGTSVERAAASTDYYKRGDFIPGLRVDGMDILCVREATRFAAAYCRSGKGPI" dotPlot(s2c(pdsc), s2c(pdhs)) # Read in the DNA sequence of tryptophan synthase mRNA from Zea maize (corn) tsZdna = read.fasta("http://shodor.org/talks/ACCA2010/Bio-IntroR/Tryp_synth_Zmaize", seqtype = "DNA") # The "summary" method returns the name, length, class and mode of a SeqFastadna # object. Compare this to the display of the entire SeqFastadna object. summary(tsZdna) tsZdna # We can see a listing of all of the methods available for an object of a # particular class methods(class = "SeqFastadna") # The getSequence method returns the sequence alone, as expected. getSequence(tsZdna) # Now read in the amino acid sequence of tryptophan synthase mRNA from Zea maize tsZaa = read.fasta("http://shodor.org/talks/ACCA2010/Bio-IntroR/Tryp_synth_AA_Zmaize", seqtype = "AA") summmary(tsZaa) ### need to add instructions for sequence download from databases as well # Similar methods exist SeqFastaAA objects, why are they different? methods(class = "SeqFastaAA") pdZaa = read.fasta("http://shodor.org/talks/ACCA2010/Bio-IntroR/pyr_dehyd_AA_Zmaize", seqtype = "DNA") getSequence(pdZaa) pdZaa[[1]]=toupper(getSequence(pdZaa)) dotPlot(getSequence(pdZaa), getSequence(tsZaa)) tsAaa = read.fasta("./seq_data/tryptophan_synthase_Arabidopsis_thaliana", seqtype = "AA") lwDdna = read.fasta("./seq_data/Lost_World_DinoDNA", seqtype = "DNA") # Using the read.fasta command with the seqtype="AA" argument creates a data # structure of class SeqFastaAA. Type one of the sequence names at the R prompt # to see that this structure includes the amino acid sequence, as well as name # and annotation information from the FASTA file header. # In order to carry out analyses of a sequence we need to isolate the sequence # data from the other information using the "getSequence" method. We can use the # R "table" function to tally the the frequencies of amino acids in a sequence. # The "seqinr" package includes a function called "count" that can carry out a # similar tally, but can be addapted in ways that are more useful for biological # sequence data. By default the "count" function uses the lower case nucleotide # alphabet for this tally, so we will need to provide it with the upper case # amino acid alphabet ("s2c" converts the string we provide into a vector of # characters). Remember that there are 20 naturally occuring amino acids, and # there are no amino acids represented by B, J, O, U, X, or Z. tzntab = table(getSequence(tsZdna)) # build table of nucleotide frequencies tzntab # show table of nucleotide frequencies count(getSequence(tsZdna), 1) count(getSequence(tsZdna), 2) count(getSequence(tsZdna), 3) tzatab = table(getSequence(tsZaa)) # build table of amino acid frequencies tzntab # show table of amino acid frequencies count(getSequence(trpsZea), by = 1, alphabet = s2c("ACDEFGHIKLMNPQRSTVWY")) # The "dotchart" function can be used to plot the amino acid frequencies dotchart(tztab) # plot the amino acid frequencies on a chart barplot(tztab) tztab = table(getSequence(trpsZea)) dotPlot(trpsZea, trpsAra) dotPlot(tsz, tsa)