titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
library(ggplot2)
blackbird <- read.csv("DataForLabs/chap12e2BlackbirdTestosterone.csv")
t.test(blackbird$logAfterImplant, blackbird$logBeforeImplant, paired = TRUE)
mean(blackbird$logAfterImplant)
mean(blackbird$logBeforeImplant)
blackbird <- read.csv("DataForLabs/chap12e2BlackbirdTestosterone.csv")
View(blackbird)
plot(beforeImplant~afterImplant,data=blackbird)
plot(logBeforeImplant~logAfterImplant,data=blackbird)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 9 ######
# COMPARING MEANS OF TWO GROUPS
# This file contains all of the commands in the Learning the Tools section of tutorial 9.
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
library(ggplot2)
#### Stripchart ####
ggplot(titanicData, aes(x = survive, y = age)) +
geom_jitter(position = position_jitter(0.05)) +
theme_minimal()
# multiple histograms
ggplot(titanicData, aes(x = age)) +
geom_histogram() +
facet_wrap(~ survive, ncol = 1)
#violin plots
ggplot(titanicData, aes(x=survive, y=age, fill = survive)) +
geom_violin() +
xlab("Survival") + ylab("Age") +
theme_classic()+scale_fill_manual(values=c("#FFB531","#BC211A"))+
stat_summary(fun.y=mean,  geom="point", color="black")+
theme(legend.position="none")+
theme(aspect.ratio=1)
####  2-sample t-test
t.test(age ~ survive, data = titanicData, var.equal = TRUE)
####  Welch's t-test
t.test(age ~ survive, data = titanicData, var.equal = FALSE)
##### Paired t-test ####
blackbird <- read.csv("DataForLabs/chap12e2BlackbirdTestosterone.csv")
t.test(blackbird$logAfterImplant, blackbird$logBeforeImplant, paired = TRUE)
#### Levene's test ####
library(car)
leveneTest(data = titanicData, age ~ survive, center = mean)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 8 ######
# Normal distribution and sample means
# This file contains all of the commands in the Learning the Tools section of tutorial 8.
library(ggplot2)
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
#  Histograms
ggplot(titanicData, aes(x = age)) + geom_histogram(binwidth = 10)
hist(titanicData$age)
# QQ plots
qqnorm(titanicData$age)
qqline(titanicData$age)
# Random numbers from a normal distribution
rnorm(n = 20, mean = 13, sd = 4)
normal_vector <- rnorm(n = 100, mean = 13, sd = 4)
qqnorm(normal_vector)
qqline(normal_vector)
#  QQ plots for a non-normal disitrbution
countries <- read.csv("DataForLabs/countries.csv", stringsAsFactors = TRUE)
popSizes = countries$total_population_in_thousands_2015
hist(popSizes, breaks = 10)
qqnorm(popSizes)
qqline(popSizes)
# QQ plot from non-normal distribution
log(titanicData$age)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 7 ######
# Contingency analysis
# This file contains all of the commands in the Learning the Tools section of tutorial 7.
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
#### Factors
levels(titanicData$survive)
titanicData$survive <- factor(titanicData$survive, levels = c("yes", "no"))
####  Frequency tables ####
sex_survive_table <- table(titanicData$sex, titanicData$survive)
sex_survive_table
sex_survive_table_direct <- data.frame(yes = c(307,142), no = c(156,708),
row.names = c("female","male"))
####  Mosaic pots ####
mosaicplot(sex_survive_table)
mosaicplot(sex_survive_table, color = c("darkred","gold"), xlab  ="Sex", ylab = "Survival")
#### Odds ratios ####
sex_survive_fisher = fisher.test(sex_survive_table)
sex_survive_fisher$estimate
sex_survive_fisher$conf.int
#### Chi-squared contingency analysis ####
chisq.test(sex_survive_table)$expected
chisq.test(sex_survive_table, correct=FALSE)
#### Fisher's exact test ####
fisher.test(sex_survive_table)
##### Question 5  #####
tCountTable = data.frame(Common = c (25, 4), Uncommon = c(55, 10), row.names = c("Found", "Not found"))
MMlist$color
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 6 ######
# Frequency data
# This file contains all of the commands in the Learning the Tools section of tutorial 6.
#  Confidence intervals for a proportion
# install.packages("binom", dependencies = TRUE)
# Remove the # from the previous line if you need to install the package "binom" for the first time.
library(binom)
binom.confint(x = 30, n = 87, method = "ac")
#  Binomial test
binom.test(x = 14, n = 18, p = 0.5)
# Goodness of fit
# table
MMlist <- read.csv("DataForLabs/MandMlist.csv")
MMlist$color
MMtable <- table(MMlist$color)
MMtable
#  chisq.test()
expected_proportions <- c(0.24, 0.14, 0.16, 0.20, 0.13, 0.13)
sum(MMtable)
55 * expected_proportions
chisq.test(MMtable, p = expected_proportions)
#  Goodness of fit to a Poisson distribution
extinctData <- read.csv("DataForLabs/MassExtinctions.csv", stringsAsFactors = TRUE)
head(extinctData)
number_of_extinctions <- extinctData$numberOfExtinctions
table(number_of_extinctions)
#Generating expected frequencies
mean_extinctions <- mean(number_of_extinctions)
mean_extinctions
dpois(x = 3, lambda = 4.21)
0:20
expected_probability = dpois(x = 0:20, lambda = 4.21)
expected_probability
length(number_of_extinctions)
76 * expected_probability
expected_combined <- c(5.878568, 9.999264, 14.032300, 14.768996, 12.435494,  8.725572, 5.247808, 4.911998)
observed_combined <- c(13, 15, 16, 7, 10, 4, 2, 9)
# chi-squared statistic
chisq.test(observed_combined, p = expected_combined, rescale.p = TRUE)$statistic
#calculating the p-value
pchisq(q = 23.939, df = 6, lower.tail = FALSE)
####  Making frequency distribution plot for Activity
freqs <- dbinom(0:5, size = 5, p = 1/6)
num3s <- 0:5
diceFreqTable <-data.frame(num3s,freqs)
ggplot(data = diceFreqTable, aes(x=num3s, y=freqs) )+
geom_bar(colour="black", fill="orange", width=1, stat="identity") +
xlab("Number of 3's") + ylab("Predicted frequency") +
theme_classic() +
scale_x_continuous(breaks=seq(0, 5, 1)) +
theme(axis.text=element_text(size=12),
axis.title=element_text(size=16,face="bold"))
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 5 ######
# Describing data
# This file contains all of the commands in the Learning the Tools section of tutorial 5.
# Missing data
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
titanicData$age
#mean
mean(titanicData$age)
mean(titanicData$age, na.rm = TRUE)
#median and summary
median(titanicData$age, na.rm = TRUE)
summary(titanicData$age)
# Measures of variability
var(titanicData$age, na.rm = TRUE)
sd(titanicData$age, na.rm = TRUE)
sqrt(var(titanicData$age, na.rm = TRUE))
# Coefficient of variation
100 * sd(titanicData$age, na.rm = TRUE) / mean(titanicData$age, na.rm = TRUE)
# Interquartile range
IQR(titanicData$age, na.rm = TRUE)
# Confidence interval for the mean
t.test(titanicData$age)$conf.int
t.test(titanicData$age, conf.level = 0.99)$conf.int
# Optional -- calculating the Standard error of the mean
SEM = function(datalist){sd(datalist, na.rm = TRUE) / sqrt(sum(!is.na(datalist)))}
SEM(titanicData$age)
#  Quantiles
quantile(titanicData$age, c(0.025, 0.975), na.rm = TRUE)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 4  #######
#  Sampling
# Let's write a function to speed up the Week 3 lab, that takes a data frame
# with some number of shell measurements and:
#    (1) Sample x individuals by row and display
#    (2) Display the meansures for those rows
#    (3) Calculate the mean of those measures
length=seq(1:30)
population_data = as.data.frame(length) ### Replace with your real data!
# For the following function "mean_from_a_sample", datalist should be a column
# that gives a list of a single numerical variable over a number of individuals.
# This will represent the population from which the sample is taken.
# sample_size is the number of individuals that you want to
# have in the sample
mean_from_a_sample <- function(datalist,sample_size) {
population_size = length(datalist)  # Establish the size of the "population"
sample_numbers = sample.int(population_size,sample_size)  # chooses which individuals to sample
sample_values = datalist[sample_numbers]  # Looks up the values for those individuals in the data
sample_mean = mean(sample_values,na.rm=TRUE)   # Calculates the mean of thoise sampled individuals
sample_mean
}
#####   Copy the above function all the way to here to paste into the console.
##### Taking 1000 samples and listing the mean of each sample
sample_mean_length = rep(0,1000)
for(i in 1:1000) sample_mean_length[i]= mean_from_a_sample(population_data$length,5)
##Calculating the mean of all sample means
mean(sample_mean_length)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 3 ######
# Graphics
# This file contains all of the commands in the Learning the Tools section of tutorial 3.
library(ggplot2)
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
# ggplot
# Histograms
ggplot(titanicData, aes(x=age)) +
geom_histogram()
# Bar graphs
ggplot(titanicData, aes(x=sex)) +
geom_bar(stat="count")
# Box plots
ggplot(titanicData, aes(x=sex, y=age)) + geom_boxplot()
# scatterplots
guppyFatherSonData <- read.csv("DataForLabs/chap02e3bGuppyFatherSonAttractiveness.csv", stringsAsFactors = TRUE)
ggplot(guppyFatherSonData, aes(x=fatherOrnamentation, y=sonAttractiveness)) +
geom_point()
# Graphics options
ggplot(guppyFatherSonData, aes(x=fatherOrnamentation,y=sonAttractiveness)) +
geom_point() +
xlab("Father's ornamentation") + ylab("Sons attractiveness")
ggplot(guppyFatherSonData, aes(x=fatherOrnamentation,y=sonAttractiveness)) +
geom_point() +
xlab("Father's ornamentation") + ylab("Sons attractiveness") +
theme_minimal()
# getting help
help(ggplot)
library(dplyr)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 1 ######
#  Intro to R part 1
# This file contains all of the commands in the Learning the Tools section of tutorial 1.
# Command line
2+2
log(42)
sqrt(4)
4^3
(1/(sqrt(2 * pi * (3.1)^2))) * exp(-(12-10.7)^2/(2*3.1))
#In the formula above, pi is 3.1415...., and the exp() function raises the base
#of the natural log (e = 2.78...) to the power of the value in the argument.
# Comments
# This is a comment.  Running this in R will have no effect.
# Functions
log(4, base = 10)
# Defining variables
x <- 4
x + 3
x <- 32
x
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 11 ######
# Correlation and regression
# This file contains all of the commands in the Learning the Tools section of tutorial 11.
library(ggplot2)
guppyData <- read.csv("DataForLabs/chap02e3bGuppyFatherSonAttractiveness.csv", stringsAsFactors = TRUE)
ggplot(guppyData, aes(x=fatherOrnamentation, y=sonAttractiveness)) +
geom_point() +
theme_minimal() +
xlab("Father's ornamentation") +
ylab("Son's attractiveness")
#### Correlation  ####
cor(guppyData$fatherOrnamentation, guppyData$sonAttractiveness)
cor(guppyData$fatherOrnamentation, guppyData$sonAttractiveness)^2
cor.test(guppyData$fatherOrnamentation, guppyData$sonAttractiveness)
#### Spearman's correlation  ####
cor.test(guppyData$fatherOrnamentation, guppyData$sonAttractiveness, method = "spearman")
#### Linear regression  ####
guppyRegression <- lm(sonAttractiveness ~ fatherOrnamentation, data = guppyData)
guppyRegression
summary(guppyRegression)
ggplot(guppyData, aes(x=fatherOrnamentation, y=sonAttractiveness)) +
geom_point() +
theme_minimal() +
xlab("Father's ornamentation") +
ylab("Son's attractiveness") +
geom_smooth(method = lm)
ggplot(guppyData, aes(x=fatherOrnamentation, y=sonAttractiveness)) +
geom_point() +
theme_minimal() +
xlab("Father's ornamentation") +
ylab("Son's attractiveness") +
geom_smooth(method = lm, se = FALSE)
#Residuals
residuals(guppyRegression)
plot(guppyData$fatherOrnamentation, residuals(guppyRegression))
abline(h=0)
######## Analysis of Biological Data Labs -- Learning the tools --  Tutorial 10 ######
# ANOVA
# This file contains all of the commands in the Learning the Tools section of tutorial 10.
#### ANOVA  ####
titanicData <- read.csv("DataForLabs/titanic.csv", stringsAsFactors = TRUE)
library(ggplot2)
ggplot(titanicData, aes(x = age)) +
geom_histogram() +
facet_wrap(~ passenger_class, ncol = 1)
library(dplyr)
titanic_by_passenger_class <- group_by(titanicData,passenger_class)
summarise(titanic_by_passenger_class, group_mean = mean(age, na.rm=TRUE))
summarise(titanic_by_passenger_class, group_mean = mean(age, na.rm=TRUE), group_sd = sd(age, na.rm=TRUE))
titanicANOVA <- lm(age ~ passenger_class, data = titanicData)
anova(titanicANOVA)
#### Tukey-Kramer  ####
TukeyHSD(aov(titanicANOVA))
#### Kruskall-Wallis test  ####
kruskal.test(age ~ passenger_class, data = titanicData)
