#take a first look at the raw data: first rows and column types
stockC %>% head()
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1 1 Middle 2014 spring capron small 1
## 2 1 Middle 2014 spring capron small 1
## 3 1 Middle 2014 spring capron small 1
## 4 1 Middle 2014 spring capron small 1
## 5 1 Middle 2014 spring capron small 1
## 6 1 Middle 2014 spring capron small 1
## Soak_time_Or Catch_g
## 1 1 0
## 2 1 0
## 3 1 96
## 4 1 0
## 5 1 0
## 6 1 0
#compact structure summary: dimensions, column classes, example values
stockC %>% str()
## 'data.frame': 2296 obs. of 9 variables:
## $ Fishing_id : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Location : chr "Middle" "Middle" "Middle" "Middle" ...
## $ Year : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ season : chr "spring" "spring" "spring" "spring" ...
## $ Gillnet_category: chr "capron" "capron" "capron" "capron" ...
## $ Net.mesh.gr : chr "small" "small" "small" "small" ...
## $ Net_length_Or : int 1 1 1 1 1 1 2 2 2 2 ...
## $ Soak_time_Or : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Catch_g : num 0 0 96 0 0 0 0 0 0 0 ...
#Net length and soak time are ordinal categories in this data set, so encode
#them as ORDERED factors (glm will then use polynomial contrasts .L/.Q/...).
#Year and season are labels, not numbers, so they become plain factors.
stockC$Year          <- factor(stockC$Year)
stockC$season        <- factor(stockC$season)
stockC$Net_length_Or <- factor(stockC$Net_length_Or, ordered = TRUE)
stockC$Soak_time_Or  <- factor(stockC$Soak_time_Or, ordered = TRUE)
#which seasons do we have
unique(stockC[["season"]])
## [1] spring summer fall
## Levels: fall spring summer
#balance among seasons
table(stockC[["season"]])
##
## fall spring summer
## 351 833 1112
#how many locations (note the NA values)
unique(stockC[["Location"]])
## [1] "Middle" "Upstream" "Downstream" NA
#balance among locations
table(stockC[["Location"]])
##
## Downstream Middle Upstream
## 26 1233 1034
#gillnet categories and balance
unique(stockC[["Gillnet_category"]])
## [1] "capron" "nylon" "other"
table(stockC[["Gillnet_category"]])
##
## capron nylon other
## 1079 1210 7
#probably we should remove category "other" in gillnets
#(only 7 observations — too few to estimate a reliable effect;
# note that dplyr::filter() re-numbers the data frame's row names,
# and would also drop rows where the condition is NA)
stockC<-stockC %>% filter (Gillnet_category!="other")
#We will use generalized linear models with the Tweedie distribution family, which allows many zeros in the distribution. Here we build the model with all factors included and see whether we can drop some factors.
#Full model: all candidate predictors as additive main effects on catch weight.
#tweedie(var.power = 1.1, link.power = 0) is the Tweedie family from the
#statmod package: 1 < var.power < 2 gives a compound Poisson-gamma
#distribution (a point mass at zero plus a continuous positive part),
#and link.power = 0 selects the log link.
modC1<- glm(Catch_g~Year + season + Location + Gillnet_category + Net.mesh.gr + Net_length_Or +Soak_time_Or, family=tweedie(var.power=1.1, link.power=0), data=stockC)
summary(modC1)
##
## Call:
## glm(formula = Catch_g ~ Year + season + Location + Gillnet_category +
## Net.mesh.gr + Net_length_Or + Soak_time_Or, family = tweedie(var.power = 1.1,
## link.power = 0), data = stockC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -190.729 -31.237 -8.590 -3.532 264.681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.56638 2.15233 3.051 0.002309 **
## Year1992 -2.02409 0.21846 -9.265 < 2e-16 ***
## Year1993 -1.34608 0.19184 -7.017 3.00e-12 ***
## Year1994 -2.27349 0.29266 -7.768 1.20e-14 ***
## Year1995 -1.62397 0.19564 -8.301 < 2e-16 ***
## Year1996 -0.56539 0.26576 -2.127 0.033496 *
## Year1997 -1.30138 0.27689 -4.700 2.76e-06 ***
## Year1998 -1.47752 0.60221 -2.453 0.014224 *
## Year1999 -2.64934 0.26765 -9.898 < 2e-16 ***
## Year2000 -2.44237 0.30916 -7.900 4.33e-15 ***
## Year2001 -4.33690 1.19796 -3.620 0.000301 ***
## Year2002 -2.55302 0.29924 -8.532 < 2e-16 ***
## Year2003 -1.75940 0.24607 -7.150 1.17e-12 ***
## Year2004 -3.25933 0.39188 -8.317 < 2e-16 ***
## Year2005 -3.15205 0.44637 -7.061 2.19e-12 ***
## Year2006 -5.06887 1.94041 -2.612 0.009054 **
## Year2007 -4.13477 0.69318 -5.965 2.84e-09 ***
## Year2008 -3.49185 0.39847 -8.763 < 2e-16 ***
## Year2009 -6.54723 2.59571 -2.522 0.011727 *
## Year2010 -3.27110 0.85260 -3.837 0.000128 ***
## Year2011 -5.91710 1.93087 -3.064 0.002206 **
## Year2012 -4.35357 0.55442 -7.852 6.27e-15 ***
## Year2013 -4.13633 0.47565 -8.696 < 2e-16 ***
## Year2014 -4.52713 0.28749 -15.747 < 2e-16 ***
## Year2015 -4.06171 0.32880 -12.353 < 2e-16 ***
## Year2016 -3.17572 0.28116 -11.295 < 2e-16 ***
## Year2017 -4.01908 0.24690 -16.278 < 2e-16 ***
## Year2018 -3.34323 0.25499 -13.111 < 2e-16 ***
## Year2019 -3.96988 0.30720 -12.923 < 2e-16 ***
## Year2020 -3.17021 0.27249 -11.634 < 2e-16 ***
## Year2021 -3.20703 0.28225 -11.362 < 2e-16 ***
## seasonspring 0.45110 0.16231 2.779 0.005494 **
## seasonsummer -0.13658 0.13214 -1.034 0.301421
## LocationMiddle 3.47547 2.14144 1.623 0.104738
## LocationUpstream 3.25628 2.13955 1.522 0.128165
## Gillnet_categorynylon 0.30745 0.12268 2.506 0.012277 *
## Net.mesh.grfull -2.17180 0.59549 -3.647 0.000271 ***
## Net.mesh.grsmall -2.09590 0.37860 -5.536 3.46e-08 ***
## Net_length_Or.L 1.02684 0.45385 2.263 0.023761 *
## Net_length_Or.Q -1.23429 0.36691 -3.364 0.000781 ***
## Net_length_Or.C 0.05849 0.21704 0.269 0.787575
## Net_length_Or^4 -0.02982 0.11461 -0.260 0.794718
## Soak_time_Or.L -0.02503 0.09397 -0.266 0.790008
## Soak_time_Or.Q 0.40939 0.29936 1.368 0.171584
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Tweedie family taken to be 2626.631)
##
## Null deviance: 8267841 on 2286 degrees of freedom
## Residual deviance: 3412680 on 2243 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: NA
##
## Number of Fisher Scoring iterations: 9
#NOTE: "2 observations deleted due to missingness" — the 2 rows with NA
#Location are not in the model frame, so model-based row indices need not
#match row positions in stockC (matters for outlier removal later).
#summary() reports AIC: NA for Tweedie models; use AICtweedie() instead.
#Can we drop anything? I.e. is model without certain variable significantly different from the one that includes it?
#drop1() removes each term in turn and compares deviances with an F test
#(F test is appropriate here because the dispersion is estimated)
drop1(modC1, test = "F")
## Single term deletions
##
## Model:
## Catch_g ~ Year + season + Location + Gillnet_category + Net.mesh.gr +
## Net_length_Or + Soak_time_Or
## Df Deviance F value Pr(>F)
## <none> 3412680
## Year 30 4877582 32.0938 < 2.2e-16 ***
## season 2 3463962 16.8525 5.439e-08 ***
## Location 2 3437273 8.0818 0.0003182 ***
## Gillnet_category 1 3429358 10.9612 0.0009451 ***
## Net.mesh.gr 2 3603231 62.6202 < 2.2e-16 ***
## Net_length_Or 4 3501726 14.6314 8.403e-12 ***
## Soak_time_Or 2 3418146 1.7963 0.1661473
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#yes, soak time is not significant
#Get AIC value of the model
#(AICtweedie() from the tweedie package; summary() shows AIC: NA for this family)
AICtweedie(modC1)
## [1] 21002.13
#How much variance does this model explain (in %)?
#(D-squared: proportion of null deviance explained by the model)
modEvA::Dsquared(modC1) *100
## [1] 58.72344
#Soak time is not significant, so we will fit a model without it and compare the two models using AIC.
#Reduced model: same as modC1 but with Soak_time_Or dropped
modC2<- glm(Catch_g~Year + season + Location + Gillnet_category + Net.mesh.gr + Net_length_Or, family=tweedie(var.power=1.1, link.power=0), data=stockC)
summary(modC2)
##
## Call:
## glm(formula = Catch_g ~ Year + season + Location + Gillnet_category +
## Net.mesh.gr + Net_length_Or, family = tweedie(var.power = 1.1,
## link.power = 0), data = stockC)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -192.061 -31.289 -8.716 -3.506 267.393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.69431 2.15070 3.113 0.001878 **
## Year1992 -2.02577 0.21849 -9.272 < 2e-16 ***
## Year1993 -1.37024 0.19123 -7.166 1.05e-12 ***
## Year1994 -2.27573 0.29231 -7.785 1.05e-14 ***
## Year1995 -1.61607 0.19560 -8.262 2.41e-16 ***
## Year1996 -0.56401 0.26436 -2.134 0.032992 *
## Year1997 -1.28799 0.27685 -4.652 3.47e-06 ***
## Year1998 -1.99196 0.47597 -4.185 2.96e-05 ***
## Year1999 -2.68313 0.26119 -10.273 < 2e-16 ***
## Year2000 -2.45445 0.30445 -8.062 1.21e-15 ***
## Year2001 -4.34204 1.19921 -3.621 0.000300 ***
## Year2002 -2.55788 0.29420 -8.694 < 2e-16 ***
## Year2003 -1.76295 0.24441 -7.213 7.45e-13 ***
## Year2004 -3.24925 0.39127 -8.304 < 2e-16 ***
## Year2005 -3.15464 0.44574 -7.077 1.96e-12 ***
## Year2006 -5.07667 1.94157 -2.615 0.008990 **
## Year2007 -4.15134 0.68927 -6.023 2.00e-09 ***
## Year2008 -3.49539 0.39845 -8.773 < 2e-16 ***
## Year2009 -6.54048 2.59931 -2.516 0.011931 *
## Year2010 -3.27361 0.85337 -3.836 0.000128 ***
## Year2011 -5.91998 1.93276 -3.063 0.002218 **
## Year2012 -4.34787 0.55478 -7.837 7.06e-15 ***
## Year2013 -4.11711 0.47347 -8.696 < 2e-16 ***
## Year2014 -4.51562 0.28500 -15.844 < 2e-16 ***
## Year2015 -4.05112 0.32753 -12.369 < 2e-16 ***
## Year2016 -3.18036 0.28071 -11.330 < 2e-16 ***
## Year2017 -4.01082 0.24590 -16.311 < 2e-16 ***
## Year2018 -3.33918 0.25502 -13.094 < 2e-16 ***
## Year2019 -3.96459 0.30721 -12.905 < 2e-16 ***
## Year2020 -3.16691 0.27283 -11.608 < 2e-16 ***
## Year2021 -3.20159 0.28220 -11.345 < 2e-16 ***
## seasonspring 0.47465 0.12725 3.730 0.000196 ***
## seasonsummer -0.13181 0.13109 -1.005 0.314794
## LocationMiddle 3.46678 2.14319 1.618 0.105894
## LocationUpstream 3.25472 2.14140 1.520 0.128675
## Gillnet_categorynylon 0.30602 0.12279 2.492 0.012767 *
## Net.mesh.grfull -2.16351 0.59578 -3.631 0.000288 ***
## Net.mesh.grsmall -2.11404 0.37812 -5.591 2.53e-08 ***
## Net_length_Or.L 0.91549 0.44917 2.038 0.041648 *
## Net_length_Or.Q -1.30913 0.36465 -3.590 0.000338 ***
## Net_length_Or.C 0.01269 0.21551 0.059 0.953062
## Net_length_Or^4 -0.03656 0.11455 -0.319 0.749649
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Tweedie family taken to be 2631.836)
##
## Null deviance: 8267841 on 2286 degrees of freedom
## Residual deviance: 3418146 on 2245 degrees of freedom
## (2 observations deleted due to missingness)
## AIC: NA
##
## Number of Fisher Scoring iterations: 9
#Get the AIC value
AICtweedie(modC2)
## [1] 21008.01
#Look at explained variance
modEvA::Dsquared(modC2) *100
## [1] 58.65733
#Compare the two models with AIC (lower AIC = better trade-off of fit vs complexity)
AICtweedie(modC2)
## [1] 21008.01
AICtweedie(modC1)
## [1] 21002.13
#We find that AIC is lower for the first model (modC1), so we consider it the best model.
#Next we will use a function, scaleCE, that scales the CPUE values against the first Year.
#Extract the Year coefficients and turn them into a standardised CPUE index.
#The baseline year (first factor level) has no coefficient, so prepend a
#zero row for it (Estimate 0 => exp(0) = 1 before scaling).
tt <- summary(modC1)$coefficients
#anchor the pattern so only Year dummies match, not any other term
#that might contain "Year" in its name
tt2 <- rbind(c(0, 0, 0, 0), tt[grep("^Year", rownames(tt)), ])
#back-transform from the log link and scale against the first year
#(scaleCE is a project helper defined elsewhere — presumably index/index[1];
# TODO confirm)
tt3 <- cbind(tt2, exp(tt2[, "Estimate"]), scaleCE(exp(tt2[, "Estimate"])))
#derive the year labels from the factor levels instead of hard-coding
#1991:2021, so the code still works if the year range changes
yrs <- as.numeric(levels(stockC$Year))
cpue_stand <- cbind(tt3, yrs)
colnames(cpue_stand) <- c("Estimate", "StdError", "t value", "Pr(>|t|)",
                          "Exp_estimate", "Scaled_estimate", "Year")
rm(tt, tt2, tt3)
#write our model coefficients into a cpue_stand dataframe
cpue_stand <- as.data.frame(cpue_stand)
#Plotting using base R graphics
plot(cpue_stand$Year, cpue_stand$Scaled_estimate,
     ylab = "Standardised CPUE", main = "Stock C", xlab = "Year", pch = 19)
#Or plotting using ggplot
#geom_point() inherits data and aesthetics from the ggplot() call,
#so they do not need to be repeated; the two theme() calls are merged
ggplot(cpue_stand, aes(Year, Scaled_estimate)) +
  geom_point(size = 2) +
  #geom_errorbar(aes(ymin = Scaled_estimate, ymax = Scaled_estimate + StdError)) +
  ylab("Standardised CPUE") +
  xlab("") +
  ggtitle("") +
  geom_smooth(method = "loess") +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black"),
        axis.text = element_text(size = 14),
        axis.title = element_text(size = 14))
## `geom_smooth()` using formula = 'y ~ x'
#save
#write.csv(cpue_stand, "cpue_standStockC1.csv")
#plot model coefficients
#(plot_model() comes from an external package, presumably sjPlot,
# loaded elsewhere in the file — TODO confirm)
plot_model(modC1)
#plot effects - you can run it locally as well, but we will not run it here
#plot(allEffects(modC1))
#Check residuals and outliers, try again without them if needed
#model diagnostics
#(autoplot() on a glm requires an external package, presumably ggfortify,
# loaded elsewhere — TODO confirm; which = 1:6 requests all six
# diagnostic panels)
autoplot(modC1, which = 1:6, label.size = 3)+
theme_bw()
#outliers- quantitative detection
#leverage distance
# calculate the influence of data points (hat values of the fitted model)
i_n <- influence(modC1)$hat
#shows which data point has the highest influence on the fitted model
which.max(i_n)
## 1658
## 1658
#Look at these values. Index by row NAME, not position: the model dropped
#2 rows with missing values, so positions in i_n need not line up with
#row positions in stockC. names() gives the model-frame row name, which
#matches the row names of stockC.
stockC[names(which.max(i_n)), ]
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1658 208 Upstream 1998 spring nylon big 3
## Soak_time_Or Catch_g
## 1658 2 21072
#Cook’s distance.
c_d <- cooks.distance(modC1)
##shows which data point has the highest influence on the fitted model
which.max(c_d)
## 1699
## 1699
#Look at these values. As with the hat values, use the row NAME returned
#by which.max(): 2 observations were deleted for missingness when fitting,
#so positions in c_d need not match row positions in stockC.
stockC[names(which.max(c_d)), ]
## Fishing_id Location Year season Gillnet_category Net.mesh.gr Net_length_Or
## 1699 244 Upstream 1996 fall nylon big 4
## Soak_time_Or Catch_g
## 1699 3 130933
#remove outlier from the dataset (matched by row name, not position),
#and check how results have changed
stockC <- stockC[rownames(stockC) != names(which.max(c_d)), ]