Question: Does the protection of kelp bass from fishing through the creation of MPAs affect their average size?
Reading in data and displaying first six lines
fishData <- read.csv("Annual_fish_comb_20200108.csv")
head(fishData)
## YEAR MONTH DATE SITE TRANSECT QUAD SIDE VIS SP_CODE SIZE COUNT
## 1 2000 10 10/4/2000 NAPL 1 -99999 -99999 -99999 ANDA -99999 0
## 2 2000 10 10/4/2000 NAPL 2 -99999 -99999 -99999 ANDA -99999 0
## 3 2000 10 10/4/2000 NAPL 3 -99999 -99999 -99999 ANDA -99999 0
## 4 2000 10 10/4/2000 NAPL 4 -99999 -99999 -99999 ANDA -99999 0
## 5 2000 10 10/4/2000 NAPL 5 -99999 -99999 -99999 ANDA -99999 0
## 6 2000 10 10/4/2000 NAPL 6 -99999 -99999 -99999 ANDA -99999 0
## AREA SCIENTIFIC_NAME COMMON_NAME TAXON_KINGDOM TAXON_PHYLUM
## 1 80 Anisotremus davidsonii Sargo Animalia Chordata
## 2 80 Anisotremus davidsonii Sargo Animalia Chordata
## 3 80 Anisotremus davidsonii Sargo Animalia Chordata
## 4 80 Anisotremus davidsonii Sargo Animalia Chordata
## 5 80 Anisotremus davidsonii Sargo Animalia Chordata
## 6 80 Anisotremus davidsonii Sargo Animalia Chordata
## TAXON_CLASS TAXON_ORDER TAXON_FAMILY TAXON_GENUS GROUP SURVEY MOBILITY
## 1 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## 2 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## 3 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## 4 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## 5 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## 6 Actinopterygii Perciformes Haemulidae Anisotremus FISH FISH MOBILE
## GROWTH_MORPH
## 1 SOLITARY
## 2 SOLITARY
## 3 SOLITARY
## 4 SOLITARY
## 5 SOLITARY
## 6 SOLITARY
Subsetting the data to only include kelp bass
## Subset data to only include kelp bass
bass1 <- subset(fishData, SP_CODE=="PCLA")
## Remove all -99999 from dataset
bass2 = bass1[-which(bass1$SIZE < 0),]
## Include only the columns of interest to simplify the dataset
bass3 = bass2[c("YEAR", "SITE", "SP_CODE", "SIZE", "COUNT")]
## Replicates the rows based on the values of the "COUNT" column
index <- rep(1:nrow(bass3), bass3$COUNT)
bass_size <- bass3[index, ]
##Calculate the average size of the kelp bass at each YEAR and SITE
head(bass_size)
## YEAR SITE SP_CODE SIZE COUNT
## 23232 2000 NAPL PCLA 15 1
## 23233 2000 NAPL PCLA 18 1
## 23234 2000 NAPL PCLA 20 1
## 23236 2000 NAPL PCLA 30 3
## 23236.1 2000 NAPL PCLA 30 3
## 23236.2 2000 NAPL PCLA 30 3
bass_avg = bass_size %>% group_by(YEAR, SITE, SP_CODE) %>%
summarize_at(vars(SIZE), funs(mean))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
##Plot average size of kelp bass by site by year
## The colors in each site give you a rough idea of the relative average size between sites in a given year
ggplot(bass_avg, aes(x=YEAR, y=SIZE))+
geom_col(aes(fill=SITE))+
labs(y="Total size across all sites")+
ggtitle("Average size of bass across sites and years")
## Filter out all sites except MOHK, CARP (non MPAs) and IVEE and NAPLES (MPAs)
bass_comp = bass_avg[which(bass_avg$SITE %in% c("MOHK", "CARP", "IVEE", "NAPL")),]
##Make a column that outputs "CONTROL" if CARP or MOHK. Will output "MPA" if IVEE or NAPL.
bass_comp$status <- ifelse(bass_comp$SITE == "MOHK" | bass_comp$SITE=="CARP", "CONTROL", "MPA")
##Group by MPA status and year. Get the mean of the two different states, by year
bass_comp_avg = bass_comp %>% group_by(YEAR, status, SP_CODE) %>%
summarize_at(vars(SIZE), funs(mean))
##Make a grouped column chart that shows avg bass size for each year inside and outside of MPAs
## *Note: MPA status starts in 2012.
# ggplot(bass_comp_avg, aes(fill=status, x=YEAR, y=SIZE))+
# geom_bar(position="stack", stat="identity")+
# ggtitle("Average kelp bass size inside and outside MPAs")
##Box plot for sites
bass_size_post <- bass_size[which(bass_size$YEAR > 2011), ]
bass_size_post$status <- ifelse(bass_size_post$SITE == "NAPL" | bass_size_post$SITE=="IVEE", "MPA", "CONTROL")
bass_size_post <- bass_size_post[-which(bass_size_post$SITE=="SCDI" | bass_size_post$SITE=="SCTW"),]
ggplot(bass_size_post, aes(x=status, y=SIZE, fill=status))+
geom_boxplot()+
labs(x="MPA Status", y="Average size (cm)")+
theme_classic()
Median kelp bass size across years. “Control” sites are the seven mainland sites that were not designated as MPA sites. Only Naples and Isla Vista reef make up the MPA treatment
# str(bass_comp_avg)
##Subset the data further to only include post mpa years for inside/outside
bass_comp_post <- bass_comp_avg[which(bass_comp_avg$YEAR > 2012), ]
# ## Remake the grouped column chart with the subsetted years
ggplot(bass_comp_post, aes(x=YEAR, y=SIZE, fill=status))+
geom_bar(position="dodge", stat="identity")+
labs(y="Average size (cm)")+
ggtitle("Average kelp bass size inside and outside of MPAs")
This graph shows the same data as the boxplot above, but displays the average kelp bass size inside and outside MPAs by year.
Note that MPAs were created in 2012. We would expect to see the average size of the kelp bass inside the MPAs to increase. However, this does not seem to be the case.
##This is the averaged data analysis
bass_comp_ttest = bass_comp[which(bass_comp$SITE %in% c("IVEE", "NAPL")),]
#subsetting the data into Pre MPA and Post MPA
pre_mpa = bass_comp_ttest[which(bass_comp_ttest$YEAR %in% c("2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011")),]
post_mpa = bass_comp_ttest[which(bass_comp_ttest$YEAR %in% c("2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019")),]
## Boxplot for pre and post average sizes
bass_comp_ttest$status <- ifelse(bass_comp_ttest$YEAR <2012, "PRE", "POST")
ggplot(bass_comp_ttest, aes(x=status, y=SIZE, fill=status))+
geom_boxplot()+
ggtitle("Kelp bass size pre and post MPA creation")+
theme(plot.title = element_text(hjust=0.5))
##First, let’s check to see if the bass sizes are normal
#Looking to see if the data is normal
shapiro.test(pre_mpa$SIZE)
##
## Shapiro-Wilk normality test
##
## data: pre_mpa$SIZE
## W = 0.92356, p-value = 0.116
qqPlot(pre_mpa$SIZE, main = "Pre MPA qqPlot", ylab = "Fish Size Data")
## [1] 16 8
ggplot(pre_mpa, aes(x=SIZE)) +
geom_histogram(binwidth=4, color="black", fill="white")+
ggtitle("Histogram for pre-mpa fish sizes")
shapiro.test(post_mpa$SIZE)
##
## Shapiro-Wilk normality test
##
## data: post_mpa$SIZE
## W = 0.96447, p-value = 0.7694
qqPlot(post_mpa$SIZE, main = "Post MPA qqPlot", ylab = "Fish Size Data")
## [1] 8 2
ggplot(post_mpa, aes(x=SIZE)) +
geom_histogram(binwidth=3, color="black", fill="white")+
ggtitle("Post-mpa kelp bass sizes: Histogram")
The data for Pre and Post MPA fall under the CI for normality for the qqplots
The p values are not smaller than 0.05 for Pre or Post MPA data, we fail to reject that data is normal
Next, lets test for equal variances
#Because data appears to be normal an F-test is conducted to see if variances are equal
var.test(pre_mpa$SIZE, post_mpa$SIZE)
##
## F test to compare two variances
##
## data: pre_mpa$SIZE and post_mpa$SIZE
## F = 0.86256, num df = 19, denom df = 14, p-value = 0.75
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3015197 2.2831444
## sample estimates:
## ratio of variances
## 0.8625639
#The F test has a p-value greater than 0.05, we fail to reject that variances are equal
#A two-sample t-test is conducted assuming equal variance to compare the difference in means
t.test(pre_mpa$SIZE, post_mpa$SIZE, var.equal=TRUE)
##
## Two Sample t-test
##
## data: pre_mpa$SIZE and post_mpa$SIZE
## t = 0.48795, df = 33, p-value = 0.6288
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -3.810393 6.214798
## sample estimates:
## mean of x mean of y
## 25.84527 24.64307
Null hypothesis: Size does not change between MPA statuses (the difference in means is 0).
Alternative hypothesis: Size does change between MPA statuses (the difference in means is not 0)
The p-value is greater than 0.05, we fail to reject that the difference in means is 0
There is no significant difference in means for Pre and Post MPA kelp bass sizes
# ##Subset the raw data to only include the sites IVEE and NAPL
bass_size_ttest2 = bass_size[which(bass_size$SITE %in% c("IVEE", "NAPL")),]
## First, we wrangle the data and create a "status" column in the dataset for pre/post mpa
bass_size_ttest2$status <- ifelse(bass_size_ttest2$YEAR <2012, "PRE", "POST")
Our initial analysis suggested that the average size of the kelp bass did not change after MPAs were established. This makes it appear that MPAs had no effect on kelp bass size. However, we wanted to do some additional testing and look at the number of kelp bass before and after MPA zones were established.
Null hypothesis: There is no difference in the average number of kelp bass over 30 cm before and afer MPA zones were established in sites IVEE and NAPL.
Alternative hypothesis: There IS a difference in the average number of kelp bass over 30 cm before and after MPA zones were established.
Only kelp bass over 14 inches (34 cm) can be eaten by fishermen. Therefore, there should be few kelp bass over this legal limit. However, no kelp bass can be taken out of MPAs, and we would expect the number of large fish over 30cm to increase.
#Create a column that identifies kelp bass as juveniles (smaller than 30 cm) and adults (over 30 cm)
bass_size_ttest2$age <- ifelse(bass_size_ttest2$SIZE < 30, "juvenile", "adult")
## Subset the data to only include adults in the dataset
bass_size_adult <- subset(bass_size_ttest2, bass_size_ttest2$age=="adult")
## Visualize the data using a histogram to check distribution of data
ggplot(bass_size_adult, aes(x=SIZE))+
geom_histogram(binwidth = 2, color="black", fill="white")+
labs(x="Size (cm)")+
ggtitle("Size distribution of adult kelp bass inside MPAs")
bass_adult_count = bass_size_adult %>% group_by(YEAR, status, SP_CODE) %>%
summarize_at(vars(COUNT), funs(length))
## Lets visualize this data using a histogram
ggplot(bass_adult_count, aes(x=COUNT))+
geom_histogram(binwidth = 5, color="black", fill="white")+
labs(x="Number of adult kelp bass")+
ggtitle("Distribution of the number of adult kelp bass")
bass_adult_count$logCount <- log(bass_adult_count$COUNT)
## check normality
ggplot(bass_adult_count, aes(x=logCount))+
geom_histogram(binwidth = .6, color="black", fill="white")+
labs(x="Number of ln(adult kelp bass)")+
ggtitle("Distribution of the ln(number of adult kelp bass)")
qqPlot(bass_adult_count$logCount)
## [1] 17 5
# shapiro.test(bass_adult_count$logCount)
The shapiro test returns a p value greater than 0.05. We cannot reject the null hypothesis that the data is normally distributed. We move forward with our analysis.
## Split the data into pre/post mpa treatments
PRE_bass_adult_count <- subset(bass_adult_count, bass_adult_count$status=="PRE")
POST_bass_adult_count <- subset(bass_adult_count, bass_adult_count$status=="POST")
## Shapiro test on log transformed data
shapiro.test(PRE_bass_adult_count$logCount)
##
## Shapiro-Wilk normality test
##
## data: PRE_bass_adult_count$logCount
## W = 0.93003, p-value = 0.4816
shapiro.test(POST_bass_adult_count$logCount)
##
## Shapiro-Wilk normality test
##
## data: POST_bass_adult_count$logCount
## W = 0.9441, p-value = 0.6518
## Data is normal, so I'm using an F test!
var.test(PRE_bass_adult_count$logCount, POST_bass_adult_count$logCount)
##
## F test to compare two variances
##
## data: PRE_bass_adult_count$logCount and POST_bass_adult_count$logCount
## F = 0.52777, num df = 8, denom df = 7, p-value = 0.3897
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.1077233 2.3900532
## sample estimates:
## ratio of variances
## 0.5277731
## variances are equal. Moving forward with a standard t test
t.test(PRE_bass_adult_count$logCount, POST_bass_adult_count$logCount)
##
## Welch Two Sample t-test
##
## data: PRE_bass_adult_count$logCount and POST_bass_adult_count$logCount
## t = -2.5607, df = 12.669, p-value = 0.0241
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.6968498 -0.1416353
## sample estimates:
## mean of x mean of y
## 1.718063 2.637306
Since the “count” data was log transformed we need to convert it back to do analysis
exp(1)^1.718
## [1] 5.573371
exp(1)^2.637
## [1] 13.97123
## Compare the number of kelp bass over 30 cm pre and post MPA establishment
str(bass_adult_count)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 5 variables:
## $ YEAR : int 2000 2001 2002 2005 2006 2007 2008 2010 2011 2012 ...
## $ status : chr "PRE" "PRE" "PRE" "PRE" ...
## $ SP_CODE : Factor w/ 65 levels "AHOL","ANDA",..: 37 37 37 37 37 37 37 37 37 37 ...
## $ COUNT : int 8 6 5 4 2 4 13 4 13 12 ...
## $ logCount: num 2.079 1.792 1.609 1.386 0.693 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 17 obs. of 3 variables:
## ..$ YEAR : int 2000 2001 2002 2005 2006 2007 2008 2010 2011 2012 ...
## ..$ status: chr "PRE" "PRE" "PRE" "PRE" ...
## ..$ .rows :List of 17
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## ..- attr(*, ".drop")= logi TRUE
bass_adult_count$status <- factor(bass_adult_count$status, levels=unique(bass_adult_count$status))
ggplot(bass_adult_count, aes(x=status, y=COUNT, fill=status))+
geom_boxplot()+
labs(x="MPA Status")+
ylim(0,30)+
theme_classic()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).