#take a first look at the raw data: first rows and column types
stockC %>% head()
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1 1 Middle 2014 spring capron small 1
## 2 1 Middle 2014 spring capron small 1
## 3 1 Middle 2014 spring capron small 1
## 4 1 Middle 2014 spring capron small 1
## 5 1 Middle 2014 spring capron small 1
## 6 1 Middle 2014 spring capron small 1
## Soak_time_Or Catch_g
## 1 1 0
## 2 1 0
## 3 1 96
## 4 1 0
## 5 1 0
## 6 1 0
#compact structure summary: dimensions, column classes, example values
stockC %>% str()
## 'data.frame': 2296 obs. of 9 variables:
## $ Fishing_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Location : chr "Middle" "Middle" "Middle" "Middle" ...
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ season : chr "spring" "spring" "spring" "spring" ...
## $ Gillnet_category: chr "capron" "capron" "capron" "capron" ...
## $ Net.mesh.gr : chr "small" "small" "small" "small" ...
## $ Net_length_Or : int 1 1 1 1 1 1 2 2 2 2 ...
## $ Soak_time_Or : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Catch_g : num 0 0 96 0 0 0 0 0 0 0 ...
#Net length and soak time are ordinal categories in this data set, so encode
#them as ORDERED factors (glm will then use polynomial contrasts .L/.Q/...).
#Year and season are labels, not numbers, so they become plain factors.
stockC$Year          <- factor(stockC$Year)
stockC$season        <- factor(stockC$season)
stockC$Net_length_Or <- factor(stockC$Net_length_Or, ordered = TRUE)
stockC$Soak_time_Or  <- factor(stockC$Soak_time_Or, ordered = TRUE)
#which seasons do we have
unique(stockC[["season"]])
## [1] spring summer fall
## Levels: fall spring summer
#balance among seasons
table(stockC[["season"]])
##
## fall spring summer
## 351 833 1112
#how many locations (note the NA values)
unique(stockC[["Location"]])
## [1] "Middle" "Upstream" "Downstream" NA
#balance among locations
table(stockC[["Location"]])
##
## Downstream Middle Upstream
## 26 1233 1034
#gillnet categories and balance
unique(stockC[["Gillnet_category"]])
## [1] "capron" "nylon" "other"
table(stockC[["Gillnet_category"]])
##
## capron nylon other
## 1079 1210 7
#probably we should remove category "other" in gillnets
#(only 7 observations — too few to estimate a reliable effect;
# note that dplyr::filter() re-numbers the data frame's row names,
# and would also drop rows where the condition is NA)
stockC<-stockC %>% filter (Gillnet_category!="other")
#We will use generalized linear models with the Tweedie distribution family, which allows many zeros in the distribution. Here we build the model with all factors included and see whether we can drop some factors.
#Full model: all candidate predictors as additive main effects on catch weight.
#tweedie(var.power = 1.1, link.power = 0) is the Tweedie family from the
#statmod package: 1 < var.power < 2 gives a compound Poisson-gamma
#distribution (a point mass at zero plus a continuous positive part),
#and link.power = 0 selects the log link.
modC1<- glm(Catch_g~Year + season + Location + Gillnet_category + Net.mesh.gr + Net_length_Or +Soak_time_Or, family=tweedie(var.power=1.1, link.power=0), data=stockC)
summary(modC1)
##
## Call:
## glm(formula = Catch_g ~ Year + season + Location + Gillnet_category +
## Net.mesh.gr + Net_length_Or + Soak_time_Or, family = tweedie(var.power = 1.1,
## link.power = 0), data = stockC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -190.729 -31.237 -8.590 -3.532 264.681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56638 2.15233 3.051 0.002309 **
## Year1992 -2.02409 0.21846 -9.265 < 2e-16 ***
## Year1993 -1.34608 0.19184 -7.017 3.00e-12 ***
## Year1994 -2.27349 0.29266 -7.768 1.20e-14 ***
## Year1995 -1.62397 0.19564 -8.301 < 2e-16 ***
## Year1996 -0.56539 0.26576 -2.127 0.033496 *
## Year1997 -1.30138 0.27689 -4.700 2.76e-06 ***
## Year1998 -1.47752 0.60221 -2.453 0.014224 *
## Year1999 -2.64934 0.26765 -9.898 < 2e-16 ***
## Year2000 -2.44237 0.30916 -7.900 4.33e-15 ***
## Year2001 -4.33690 1.19796 -3.620 0.000301 ***
## Year2002 -2.55302 0.29924 -8.532 < 2e-16 ***
## Year2003 -1.75940 0.24607 -7.150 1.17e-12 ***
## Year2004 -3.25933 0.39188 -8.317 < 2e-16 ***
## Year2005 -3.15205 0.44637 -7.061 2.19e-12 ***
## Year2006 -5.06887 1.94041 -2.612 0.009054 **
## Year2007 -4.13477 0.69318 -5.965 2.84e-09 ***
## Year2008 -3.49185 0.39847 -8.763 < 2e-16 ***
## Year2009 -6.54723 2.59571 -2.522 0.011727 *
## Year2010 -3.27110 0.85260 -3.837 0.000128 ***
## Year2011 -5.91710 1.93087 -3.064 0.002206 **
## Year2012 -4.35357 0.55442 -7.852 6.27e-15 ***
## Year2013 -4.13633 0.47565 -8.696 < 2e-16 ***
## Year2014 -4.52713 0.28749 -15.747 < 2e-16 ***
## Year2015 -4.06171 0.32880 -12.353 < 2e-16 ***
## Year2016 -3.17572 0.28116 -11.295 < 2e-16 ***
## Year2017 -4.01908 0.24690 -16.278 < 2e-16 ***
## Year2018 -3.34323 0.25499 -13.111 < 2e-16 ***
## Year2019 -3.96988 0.30720 -12.923 < 2e-16 ***
## Year2020 -3.17021 0.27249 -11.634 < 2e-16 ***
## Year2021 -3.20703 0.28225 -11.362 < 2e-16 ***
## seasonspring 0.45110 0.16231 2.779 0.005494 **
## seasonsummer -0.13658 0.13214 -1.034 0.301421
## LocationMiddle 3.47547 2.14144 1.623 0.104738
## LocationUpstream 3.25628 2.13955 1.522 0.128165
## Gillnet_categorynylon 0.30745 0.12268 2.506 0.012277 *
## Net.mesh.grfull -2.17180 0.59549 -3.647 0.000271 ***
## Net.mesh.grsmall -2.09590 0.37860 -5.536 3.46e-08 ***
## Net_length_Or.L 1.02684 0.45385 2.263 0.023761 *
## Net_length_Or.Q -1.23429 0.36691 -3.364 0.000781 ***
## Net_length_Or.C 0.05849 0.21704 0.269 0.787575
## Net_length_Or^4 -0.02982 0.11461 -0.260 0.794718
## Soak_time_Or.L -0.02503 0.09397 -0.266 0.790008
## Soak_time_Or.Q 0.40939 0.29936 1.368 0.171584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Tweedie family taken to be 2626.631)
##
## Null deviance: 8267841 on 2286 degrees of freedom
## Residual deviance: 3412680 on 2243 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: NA
##
## Number of Fisher Scoring iterations: 9
#NOTE: "2 observations deleted due to missingness" — the 2 rows with NA
#Location are not in the model frame, so model-based row indices need not
#match row positions in stockC (matters for outlier removal later).
#summary() reports AIC: NA for Tweedie models; use AICtweedie() instead.
#Can we drop anything? I.e. is model without certain variable significantly different from the one that includes it?
#drop1() removes each term in turn and compares deviances with an F test
#(F test is appropriate here because the dispersion is estimated)
drop1(modC1, test = "F")
## Single term deletions
##
## Model:
## Catch_g ~ Year + season + Location + Gillnet_category + Net.mesh.gr +
## Net_length_Or + Soak_time_Or
## Df Deviance F value Pr(>F)
## <none> 3412680
## Year 30 4877582 32.0938 < 2.2e-16 ***
## season 2 3463962 16.8525 5.439e-08 ***
## Location 2 3437273 8.0818 0.0003182 ***
## Gillnet_category 1 3429358 10.9612 0.0009451 ***
## Net.mesh.gr 2 3603231 62.6202 < 2.2e-16 ***
## Net_length_Or 4 3501726 14.6314 8.403e-12 ***
## Soak_time_Or 2 3418146 1.7963 0.1661473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#yes, soak time is not significant
#Get AIC value of the model
#(AICtweedie() from the tweedie package; summary() shows AIC: NA for this family)
AICtweedie(modC1)
## [1] 21002.13
#How much variance does this model explain (in %)?
#(D-squared: proportion of null deviance explained by the model)
modEvA::Dsquared(modC1) *100
## [1] 58.72344
#Soak time is not significant, so we will fit a model without it and compare the two models using AIC.
#Reduced model: same as modC1 but with Soak_time_Or dropped
modC2<- glm(Catch_g~Year + season + Location + Gillnet_category + Net.mesh.gr + Net_length_Or, family=tweedie(var.power=1.1, link.power=0), data=stockC)
summary(modC2)
##
## Call:
## glm(formula = Catch_g ~ Year + season + Location + Gillnet_category +
## Net.mesh.gr + Net_length_Or, family = tweedie(var.power = 1.1,
## link.power = 0), data = stockC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -192.061 -31.289 -8.716 -3.506 267.393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.69431 2.15070 3.113 0.001878 **
## Year1992 -2.02577 0.21849 -9.272 < 2e-16 ***
## Year1993 -1.37024 0.19123 -7.166 1.05e-12 ***
## Year1994 -2.27573 0.29231 -7.785 1.05e-14 ***
## Year1995 -1.61607 0.19560 -8.262 2.41e-16 ***
## Year1996 -0.56401 0.26436 -2.134 0.032992 *
## Year1997 -1.28799 0.27685 -4.652 3.47e-06 ***
## Year1998 -1.99196 0.47597 -4.185 2.96e-05 ***
## Year1999 -2.68313 0.26119 -10.273 < 2e-16 ***
## Year2000 -2.45445 0.30445 -8.062 1.21e-15 ***
## Year2001 -4.34204 1.19921 -3.621 0.000300 ***
## Year2002 -2.55788 0.29420 -8.694 < 2e-16 ***
## Year2003 -1.76295 0.24441 -7.213 7.45e-13 ***
## Year2004 -3.24925 0.39127 -8.304 < 2e-16 ***
## Year2005 -3.15464 0.44574 -7.077 1.96e-12 ***
## Year2006 -5.07667 1.94157 -2.615 0.008990 **
## Year2007 -4.15134 0.68927 -6.023 2.00e-09 ***
## Year2008 -3.49539 0.39845 -8.773 < 2e-16 ***
## Year2009 -6.54048 2.59931 -2.516 0.011931 *
## Year2010 -3.27361 0.85337 -3.836 0.000128 ***
## Year2011 -5.91998 1.93276 -3.063 0.002218 **
## Year2012 -4.34787 0.55478 -7.837 7.06e-15 ***
## Year2013 -4.11711 0.47347 -8.696 < 2e-16 ***
## Year2014 -4.51562 0.28500 -15.844 < 2e-16 ***
## Year2015 -4.05112 0.32753 -12.369 < 2e-16 ***
## Year2016 -3.18036 0.28071 -11.330 < 2e-16 ***
## Year2017 -4.01082 0.24590 -16.311 < 2e-16 ***
## Year2018 -3.33918 0.25502 -13.094 < 2e-16 ***
## Year2019 -3.96459 0.30721 -12.905 < 2e-16 ***
## Year2020 -3.16691 0.27283 -11.608 < 2e-16 ***
## Year2021 -3.20159 0.28220 -11.345 < 2e-16 ***
## seasonspring 0.47465 0.12725 3.730 0.000196 ***
## seasonsummer -0.13181 0.13109 -1.005 0.314794
## LocationMiddle 3.46678 2.14319 1.618 0.105894
## LocationUpstream 3.25472 2.14140 1.520 0.128675
## Gillnet_categorynylon 0.30602 0.12279 2.492 0.012767 *
## Net.mesh.grfull -2.16351 0.59578 -3.631 0.000288 ***
## Net.mesh.grsmall -2.11404 0.37812 -5.591 2.53e-08 ***
## Net_length_Or.L 0.91549 0.44917 2.038 0.041648 *
## Net_length_Or.Q -1.30913 0.36465 -3.590 0.000338 ***
## Net_length_Or.C 0.01269 0.21551 0.059 0.953062
## Net_length_Or^4 -0.03656 0.11455 -0.319 0.749649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Tweedie family taken to be 2631.836)
##
## Null deviance: 8267841 on 2286 degrees of freedom
## Residual deviance: 3418146 on 2245 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: NA
##
## Number of Fisher Scoring iterations: 9
#Get the AIC value
AICtweedie(modC2)
## [1] 21008.01
#Look at explained variance
modEvA::Dsquared(modC2) *100
## [1] 58.65733
#Compare the two models with AIC (lower AIC = better trade-off of fit vs complexity)
AICtweedie(modC2)
## [1] 21008.01
AICtweedie(modC1)
## [1] 21002.13
#We find that AIC is lower for the first model (modC1), so we consider it the best model.
#Next we will use a function, scaleCE, that scales the CPUE values against the first Year.
#Extract the Year coefficients and turn them into a standardised CPUE index.
#The baseline year (first factor level) has no coefficient, so prepend a
#zero row for it (Estimate 0 => exp(0) = 1 before scaling).
tt <- summary(modC1)$coefficients
#anchor the pattern so only Year dummies match, not any other term
#that might contain "Year" in its name
tt2 <- rbind(c(0, 0, 0, 0), tt[grep("^Year", rownames(tt)), ])
#back-transform from the log link and scale against the first year
#(scaleCE is a project helper defined elsewhere — presumably index/index[1];
# TODO confirm)
tt3 <- cbind(tt2, exp(tt2[, "Estimate"]), scaleCE(exp(tt2[, "Estimate"])))
#derive the year labels from the factor levels instead of hard-coding
#1991:2021, so the code still works if the year range changes
yrs <- as.numeric(levels(stockC$Year))
cpue_stand <- cbind(tt3, yrs)
colnames(cpue_stand) <- c("Estimate", "StdError", "t value", "Pr(>|t|)",
                          "Exp_estimate", "Scaled_estimate", "Year")
rm(tt, tt2, tt3)
#write our model coefficients into a cpue_stand dataframe
cpue_stand <- as.data.frame(cpue_stand)
#Plotting using base R graphics
plot(cpue_stand$Year, cpue_stand$Scaled_estimate,
     ylab = "Standardised CPUE", main = "Stock C", xlab = "Year", pch = 19)
#Or plotting using ggplot
#geom_point() inherits data and aesthetics from the ggplot() call,
#so they do not need to be repeated; the two theme() calls are merged
ggplot(cpue_stand, aes(Year, Scaled_estimate)) +
  geom_point(size = 2) +
  #geom_errorbar(aes(ymin = Scaled_estimate, ymax = Scaled_estimate + StdError)) +
  ylab("Standardised CPUE") +
  xlab("") +
  ggtitle("") +
  geom_smooth(method = "loess") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 14))
## `geom_smooth()` using formula = 'y ~ x'
#save
#write.csv(cpue_stand, "cpue_standStockC1.csv")
#plot model coefficients
#(plot_model() comes from an external package, presumably sjPlot,
# loaded elsewhere in the file — TODO confirm)
plot_model(modC1)
#plot effects - you can run it locally as well, but we will not run it here
#plot(allEffects(modC1))
#Check residuals and outliers, try again without them if needed
#model diagnostics
#(autoplot() on a glm requires an external package, presumably ggfortify,
# loaded elsewhere — TODO confirm; which = 1:6 requests all six
# diagnostic panels)
autoplot(modC1, which = 1:6, label.size = 3)+
theme_bw()
#outliers- quantitative detection
#leverage distance
# calculate the influence of data points (hat values of the fitted model)
i_n <- influence(modC1)$hat
#shows which data point has the highest influence on the fitted model
which.max(i_n)
## 1658
## 1658
#Look at these values. Index by row NAME, not position: the model dropped
#2 rows with missing values, so positions in i_n need not line up with
#row positions in stockC. names() gives the model-frame row name, which
#matches the row names of stockC.
stockC[names(which.max(i_n)), ]
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1658 208 Upstream 1998 spring nylon big 3
## Soak_time_Or Catch_g
## 1658 2 21072
#Cook’s distance.
c_d <- cooks.distance(modC1)
##shows which data point has the highest influence on the fitted model
which.max(c_d)
## 1699
## 1699
#Look at these values. As with the hat values, use the row NAME returned
#by which.max(): 2 observations were deleted for missingness when fitting,
#so positions in c_d need not match row positions in stockC.
stockC[names(which.max(c_d)), ]
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1699 244 Upstream 1996 fall nylon big 4
## Soak_time_Or Catch_g
## 1699 3 130933
#remove outlier from the dataset (matched by row name, not position),
#and check how results have changed
stockC <- stockC[rownames(stockC) != names(which.max(c_d)), ]