Question: Does the protection of kelp bass from fishing through the creation of MPAs affect their average size?

Reading in data and displaying first six lines

fishData <- read.csv("Annual_fish_comb_20200108.csv")
head(fishData)

##   YEAR MONTH      DATE SITE TRANSECT   QUAD   SIDE    VIS SP_CODE   SIZE COUNT
## 1 2000    10 10/4/2000 NAPL        1 -99999 -99999 -99999    ANDA -99999     0
## 2 2000    10 10/4/2000 NAPL        2 -99999 -99999 -99999    ANDA -99999     0
## 3 2000    10 10/4/2000 NAPL        3 -99999 -99999 -99999    ANDA -99999     0
## 4 2000    10 10/4/2000 NAPL        4 -99999 -99999 -99999    ANDA -99999     0
## 5 2000    10 10/4/2000 NAPL        5 -99999 -99999 -99999    ANDA -99999     0
## 6 2000    10 10/4/2000 NAPL        6 -99999 -99999 -99999    ANDA -99999     0
##   AREA        SCIENTIFIC_NAME COMMON_NAME TAXON_KINGDOM TAXON_PHYLUM
## 1   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
## 2   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
## 3   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
## 4   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
## 5   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
## 6   80 Anisotremus davidsonii       Sargo      Animalia     Chordata
##      TAXON_CLASS TAXON_ORDER TAXON_FAMILY TAXON_GENUS GROUP SURVEY MOBILITY
## 1 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
## 2 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
## 3 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
## 4 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
## 5 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
## 6 Actinopterygii Perciformes   Haemulidae Anisotremus  FISH   FISH   MOBILE
##   GROWTH_MORPH
## 1     SOLITARY
## 2     SOLITARY
## 3     SOLITARY
## 4     SOLITARY
## 5     SOLITARY
## 6     SOLITARY

Subsetting the data to only include kelp bass

## Subset data to only include kelp bass
bass1 <- subset(fishData, SP_CODE=="PCLA")

## Remove all -99999 from dataset
bass2 = bass1[-which(bass1$SIZE < 0),]

## Include only the columns of interest to simplify the dataset
bass3 = bass2[c("YEAR", "SITE", "SP_CODE", "SIZE", "COUNT")]

## Replicates the rows based on the values of the "COUNT" column
index <- rep(1:nrow(bass3), bass3$COUNT)
bass_size <- bass3[index, ]

##Calculate the average size of the kelp bass at each YEAR and SITE
head(bass_size)

##         YEAR SITE SP_CODE SIZE COUNT
## 23232   2000 NAPL    PCLA   15     1
## 23233   2000 NAPL    PCLA   18     1
## 23234   2000 NAPL    PCLA   20     1
## 23236   2000 NAPL    PCLA   30     3
## 23236.1 2000 NAPL    PCLA   30     3
## 23236.2 2000 NAPL    PCLA   30     3

bass_avg = bass_size %>% group_by(YEAR, SITE, SP_CODE) %>% 
  summarize_at(vars(SIZE), funs(mean))

## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.

##Plot average size of kelp bass by site by year 
## The colors in each site give you a rough idea of the relative average size between sites in a given year 
ggplot(bass_avg, aes(x=YEAR, y=SIZE))+
  geom_col(aes(fill=SITE))+
  labs(y="Total size across all sites")+
  ggtitle("Average size of bass across sites and years")

Data wrangling and visualization section

## Filter out all sites except MOHK, CARP (non MPAs) and IVEE and NAPLES (MPAs)
bass_comp = bass_avg[which(bass_avg$SITE %in% c("MOHK", "CARP", "IVEE", "NAPL")),]

##Make a column that outputs "CONTROL" if CARP or MOHK. Will output "MPA" if IVEE or NAPL. 
bass_comp$status <- ifelse(bass_comp$SITE == "MOHK" | bass_comp$SITE=="CARP", "CONTROL", "MPA")

##Group by MPA status and year. Get the mean of the two different states, by year 
bass_comp_avg = bass_comp %>% group_by(YEAR, status, SP_CODE) %>% 
  summarize_at(vars(SIZE), funs(mean))

##Make a grouped column chart that shows avg bass size for each year inside and outside of MPAs 
## *Note: MPA status starts in 2012. 
# ggplot(bass_comp_avg, aes(fill=status, x=YEAR, y=SIZE))+
#   geom_bar(position="stack", stat="identity")+
#   ggtitle("Average kelp bass size inside and outside MPAs")


##Box plot for sites
bass_size_post <- bass_size[which(bass_size$YEAR > 2011), ]
bass_size_post$status <- ifelse(bass_size_post$SITE == "NAPL" | bass_size_post$SITE=="IVEE", "MPA", "CONTROL")
bass_size_post <- bass_size_post[-which(bass_size_post$SITE=="SCDI" | bass_size_post$SITE=="SCTW"),]

ggplot(bass_size_post, aes(x=status, y=SIZE, fill=status))+
  geom_boxplot()+
  labs(x="MPA Status", y="Average size (cm)")+
  theme_classic()

Median kelp bass size across years. “Control” sites are the seven mainland sites that were not designated as MPA sites. Only Naples and Isla Vista reef make up the MPA treatment

# str(bass_comp_avg)
##Subset the data further to only include post mpa years for inside/outside 
bass_comp_post <- bass_comp_avg[which(bass_comp_avg$YEAR > 2012), ]

# ## Remake the grouped column chart with the subsetted years 
ggplot(bass_comp_post, aes(x=YEAR, y=SIZE, fill=status))+
  geom_bar(position="dodge", stat="identity")+
  labs(y="Average size (cm)")+
  ggtitle("Average kelp bass size inside and outside of MPAs")

This graph shows the same data as the boxplot above, but displays the average kelp bass size inside and outside MPAs by year.
Note that MPAs were created in 2012. We would expect to see the average size of the kelp bass inside the MPAs to increase. However, this does not seem to be the case.

Statistical analysis of kelp bass size inside and outside MPAs

 ##This is the averaged data analysis
bass_comp_ttest = bass_comp[which(bass_comp$SITE %in% c("IVEE", "NAPL")),]

#subsetting the data into Pre MPA and Post MPA
pre_mpa = bass_comp_ttest[which(bass_comp_ttest$YEAR %in% c("2000", "2001", "2002", "2003", "2004", "2005", "2006", "2007", "2008", "2009", "2010", "2011")),]
post_mpa = bass_comp_ttest[which(bass_comp_ttest$YEAR %in% c("2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019")),]

## Boxplot for pre and post average sizes
bass_comp_ttest$status <- ifelse(bass_comp_ttest$YEAR <2012, "PRE", "POST")
ggplot(bass_comp_ttest, aes(x=status, y=SIZE, fill=status))+
  geom_boxplot()+
  ggtitle("Kelp bass size pre and post MPA creation")+
  theme(plot.title = element_text(hjust=0.5))

##First, let’s check to see if the bass sizes are normal

#Looking to see if the data is normal
shapiro.test(pre_mpa$SIZE)

## 
##  Shapiro-Wilk normality test
## 
## data:  pre_mpa$SIZE
## W = 0.92356, p-value = 0.116

qqPlot(pre_mpa$SIZE, main = "Pre MPA qqPlot", ylab = "Fish Size Data")

## [1] 16  8

ggplot(pre_mpa, aes(x=SIZE)) + 
  geom_histogram(binwidth=4, color="black", fill="white")+
  ggtitle("Histogram for pre-mpa fish sizes")

shapiro.test(post_mpa$SIZE)

## 
##  Shapiro-Wilk normality test
## 
## data:  post_mpa$SIZE
## W = 0.96447, p-value = 0.7694

qqPlot(post_mpa$SIZE, main = "Post MPA qqPlot", ylab = "Fish Size Data")

## [1] 8 2

ggplot(post_mpa, aes(x=SIZE)) + 
  geom_histogram(binwidth=3, color="black", fill="white")+
  ggtitle("Post-mpa kelp bass sizes: Histogram")

The data for Pre and Post MPA fall under the CI for normality for the qqplots
The p values are not smaller than 0.05 for Pre or Post MPA data, we fail to reject that data is normal

Next, lets test for equal variances

#Because data appears to be normal an F-test is conducted to see if variances are equal
var.test(pre_mpa$SIZE, post_mpa$SIZE)

## 
##  F test to compare two variances
## 
## data:  pre_mpa$SIZE and post_mpa$SIZE
## F = 0.86256, num df = 19, denom df = 14, p-value = 0.75
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3015197 2.2831444
## sample estimates:
## ratio of variances 
##          0.8625639

#The F test has a p-value greater than 0.05, we fail to reject that variances are equal


#A two-sample t-test is conducted assuming equal variance to compare the difference in means
t.test(pre_mpa$SIZE, post_mpa$SIZE, var.equal=TRUE)

## 
##  Two Sample t-test
## 
## data:  pre_mpa$SIZE and post_mpa$SIZE
## t = 0.48795, df = 33, p-value = 0.6288
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.810393  6.214798
## sample estimates:
## mean of x mean of y 
##  25.84527  24.64307

Null hypothesis: Size does not change between MPA statuses (the difference in means is 0).
Alternative hypothesis: Size does change between MPA statuses (the difference in means is not 0)
The p-value is greater than 0.05, we fail to reject that the difference in means is 0

There is no significant difference in means for Pre and Post MPA kelp bass sizes

# ##Subset the raw data to only include the sites IVEE and NAPL
bass_size_ttest2 = bass_size[which(bass_size$SITE %in% c("IVEE", "NAPL")),]


## First, we wrangle the data and create a "status" column in the dataset for pre/post mpa
bass_size_ttest2$status <- ifelse(bass_size_ttest2$YEAR <2012, "PRE", "POST")

ADDITIONAL ANALYSIS

Our initial analysis suggested that the average size of the kelp bass did not change after MPAs were established. This makes it appear that MPAs had no effect on kelp bass size. However, we wanted to do some additional testing and look at the number of kelp bass before and after MPA zones were established.

Null hypothesis: There is no difference in the average number of kelp bass over 30 cm before and afer MPA zones were established in sites IVEE and NAPL.
Alternative hypothesis: There IS a difference in the average number of kelp bass over 30 cm before and after MPA zones were established.

Only kelp bass over 14 inches (34 cm) can be eaten by fishermen. Therefore, there should be few kelp bass over this legal limit. However, no kelp bass can be taken out of MPAs, and we would expect the number of large fish over 30cm to increase.

#Create a column that identifies kelp bass as juveniles (smaller than 30 cm) and adults (over 30 cm)
bass_size_ttest2$age <- ifelse(bass_size_ttest2$SIZE < 30, "juvenile", "adult")

## Subset the data to only include adults in the dataset 
bass_size_adult <- subset(bass_size_ttest2, bass_size_ttest2$age=="adult")

## Visualize the data using a histogram to check distribution of data 
ggplot(bass_size_adult, aes(x=SIZE))+
  geom_histogram(binwidth = 2, color="black", fill="white")+
  labs(x="Size (cm)")+
  ggtitle("Size distribution of adult kelp bass inside MPAs")

Size distribution is heavily right skewed. However, since we only want the NUMBER of adults, the size distribution is irrelevant.

bass_adult_count = bass_size_adult %>% group_by(YEAR, status, SP_CODE) %>% 
  summarize_at(vars(COUNT), funs(length))

## Lets visualize this data using a histogram 
ggplot(bass_adult_count, aes(x=COUNT))+
  geom_histogram(binwidth = 5, color="black", fill="white")+
  labs(x="Number of adult kelp bass")+
  ggtitle("Distribution of the number of adult kelp bass")

The data is heavily right skewed. We must perform a log transformation on the “count” variable.

bass_adult_count$logCount <- log(bass_adult_count$COUNT)

## check normality 
ggplot(bass_adult_count, aes(x=logCount))+
  geom_histogram(binwidth = .6, color="black", fill="white")+
  labs(x="Number of ln(adult kelp bass)")+
  ggtitle("Distribution of the ln(number of adult kelp bass)")

qqPlot(bass_adult_count$logCount)

## [1] 17  5

# shapiro.test(bass_adult_count$logCount)

The shapiro test returns a p value greater than 0.05. We cannot reject the null hypothesis that the data is normally distributed. We move forward with our analysis.

## Split the data into pre/post mpa treatments
PRE_bass_adult_count <- subset(bass_adult_count, bass_adult_count$status=="PRE")
POST_bass_adult_count <- subset(bass_adult_count, bass_adult_count$status=="POST")

## Shapiro test on log transformed data 
shapiro.test(PRE_bass_adult_count$logCount)

## 
##  Shapiro-Wilk normality test
## 
## data:  PRE_bass_adult_count$logCount
## W = 0.93003, p-value = 0.4816

shapiro.test(POST_bass_adult_count$logCount)

## 
##  Shapiro-Wilk normality test
## 
## data:  POST_bass_adult_count$logCount
## W = 0.9441, p-value = 0.6518

## Data is normal, so I'm using an F test!
var.test(PRE_bass_adult_count$logCount, POST_bass_adult_count$logCount)

## 
##  F test to compare two variances
## 
## data:  PRE_bass_adult_count$logCount and POST_bass_adult_count$logCount
## F = 0.52777, num df = 8, denom df = 7, p-value = 0.3897
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.1077233 2.3900532
## sample estimates:
## ratio of variances 
##          0.5277731

## variances are equal. Moving forward with a standard t test 
t.test(PRE_bass_adult_count$logCount, POST_bass_adult_count$logCount)

## 
##  Welch Two Sample t-test
## 
## data:  PRE_bass_adult_count$logCount and POST_bass_adult_count$logCount
## t = -2.5607, df = 12.669, p-value = 0.0241
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.6968498 -0.1416353
## sample estimates:
## mean of x mean of y 
##  1.718063  2.637306

Since the “count” data was log transformed we need to convert it back to do analysis

exp(1)^1.718

## [1] 5.573371

exp(1)^2.637

## [1] 13.97123

Based on the results of the t test there IS a difference in the number of adults before and after the creation of the MPA. The number of adults increases from an average of 5.5 to an average of 14.

## Compare the number of kelp bass over 30 cm pre and post MPA establishment 

str(bass_adult_count)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  17 obs. of  5 variables:
##  $ YEAR    : int  2000 2001 2002 2005 2006 2007 2008 2010 2011 2012 ...
##  $ status  : chr  "PRE" "PRE" "PRE" "PRE" ...
##  $ SP_CODE : Factor w/ 65 levels "AHOL","ANDA",..: 37 37 37 37 37 37 37 37 37 37 ...
##  $ COUNT   : int  8 6 5 4 2 4 13 4 13 12 ...
##  $ logCount: num  2.079 1.792 1.609 1.386 0.693 ...
##  - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame':   17 obs. of  3 variables:
##   ..$ YEAR  : int  2000 2001 2002 2005 2006 2007 2008 2010 2011 2012 ...
##   ..$ status: chr  "PRE" "PRE" "PRE" "PRE" ...
##   ..$ .rows :List of 17
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   ..- attr(*, ".drop")= logi TRUE

bass_adult_count$status <- factor(bass_adult_count$status, levels=unique(bass_adult_count$status))


ggplot(bass_adult_count, aes(x=status, y=COUNT, fill=status))+
geom_boxplot()+
labs(x="MPA Status")+
ylim(0,30)+
theme_classic()