The acquisition of the XCO2 and SIF data, and their initial processing, can be found at the link:
To make access easier, the data were obtained through the {fco2r} package.
## Installing packages (if needed)
# install.packages("devtools")
# Sys.getenv("GITHUB_PAT")
# Sys.unsetenv("GITHUB_PAT")
# Sys.getenv("GITHUB_PAT")
# devtools::install_github("arpanosso/fco2r")
library(readxl)
library(tidyverse)
library(geobr)
library(fco2r)
library(skimr)
library(tidymodels)
library(ISLR)
library(modeldata)
library(vip)
library(ggpubr)
source("R/my_fun.R")
# Setting the multisession plan
future::plan("multisession")
dados_estacao <- read_excel("data-raw/xlsx/estacao_meteorologia_ilha_solteira.xlsx", na = "NA")
glimpse(dados_estacao)
#> Rows: 1,826
#> Columns: 16
#> $ data <dttm> 2015-01-01, 2015-01-02, 2015-01-03, 2015-01-04, 2015-01-05, 2~
#> $ Tmed <dbl> 30.5, 30.0, 26.8, 27.1, 27.0, 27.6, 30.2, 28.2, 28.5, 29.9, 30~
#> $ Tmax <dbl> 36.5, 36.7, 35.7, 34.3, 33.2, 36.4, 37.2, 32.4, 37.1, 38.1, 38~
#> $ Tmin <dbl> 24.6, 24.5, 22.9, 22.7, 22.3, 22.8, 22.7, 24.0, 23.0, 23.3, 24~
#> $ Umed <dbl> 66.6, 70.4, 82.7, 76.8, 81.6, 75.5, 65.8, 70.0, 72.9, 67.6, 66~
#> $ Umax <dbl> 89.6, 93.6, 99.7, 95.0, 98.3, 96.1, 99.2, 83.4, 90.7, 97.4, 90~
#> $ Umin <dbl> 42.0, 44.2, 52.9, 43.8, 57.1, 47.5, 34.1, 57.4, 42.7, 38.3, 37~
#> $ PkPa <dbl> 97.2, 97.3, 97.4, 97.5, 97.4, 97.5, 97.4, 97.4, 97.4, 97.4, 97~
#> $ Rad <dbl> 23.6, 24.6, 20.2, 21.4, 17.8, 19.2, 27.0, 15.2, 21.6, 24.3, 24~
#> $ PAR <dbl> 496.6, 513.3, 430.5, 454.0, 378.2, 405.4, 565.7, 317.2, 467.5,~
#> $ Eto <dbl> 5.7, 5.8, 4.9, 5.1, 4.1, 4.8, 6.2, 4.1, 5.5, 5.7, 5.9, 6.1, 6.~
#> $ Velmax <dbl> 6.1, 4.8, 12.1, 6.2, 5.1, 4.5, 4.6, 5.7, 5.8, 5.2, 5.2, 4.7, 6~
#> $ Velmin <dbl> 1.0, 1.0, 1.2, 1.0, 0.8, 0.9, 0.9, 1.5, 1.2, 0.8, 0.8, 1.2, 1.~
#> $ Dir_vel <dbl> 17.4, 261.9, 222.0, 25.0, 56.9, 74.9, 53.4, 89.0, 144.8, 303.9~
#> $ chuva <dbl> 0.0, 0.0, 3.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.~
#> $ inso <dbl> 7.9, 8.7, 5.2, 6.2, 3.4, 4.5, 10.5, 1.3, 6.3, 8.4, 8.6, 7.9, 1~
help(oco2_br)
glimpse(fco2r::oco2_br)
#> Rows: 37,387
#> Columns: 18
#> $ longitude <dbl> -70.5, -~
#> $ longitude_bnds <chr> "-71.0:-~
#> $ latitude <dbl> -5.5, -4~
#> $ latitude_bnds <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss <chr> "2014090~
#> $ altitude_km <dbl> 3307.8, ~
#> $ alt_bnds_km <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.537127~
#> $ xco2_moles_mole_1 <dbl> 0.000394~
#> $ aerosol_total_aod <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2 <dbl> 387.2781~
First, we need to convert the CO2 concentration data (variable xco2_moles_mole_1) to ppm, and then create the date variables from the time_yyyymmddhhmmss variable.
oco2<-oco2_br %>%
mutate(
xco2 = xco2_moles_mole_1*1e06,
data = ymd_hms(time_yyyymmddhhmmss),
ano = year(data),
mes = month(data),
dia = day(data),
dia_semana = wday(data))
There is a worldwide monotonic increasing trend in the atmospheric CO2 concentration, so it must be removed so that the regional trends can be observed.
oco2 %>%
ggplot(aes(x=data,y=xco2)) +
geom_point(color="blue") +
geom_line(color="red")
Now we need to remove the trend over time; to do so, within the specific period, we remove it by means of a linear fit:
oco2 %>%
mutate(x= 1:nrow(oco2)) %>%
ggplot(aes(x=data,y=xco2)) +
geom_point(shape=21,color="black",fill="gray") +
geom_smooth(method = "lm") +
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
Extracting the coefficients:
modelo_linear_tendencia <- lm(xco2~data,
data = oco2)
coefs <- modelo_linear_tendencia$coefficients
Creating the variable xco2_est from the trend removal.
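In formula terms, the mutate() below computes (with $\beta_0$ and $\beta_1$ the intercept and slope stored in coefs, $t$ the observation date coerced to a number, and $\overline{xco2}$ the mean of the observed series):

$$
xco2\_est = \beta_0 + \beta_1 t, \qquad
\Delta = xco2\_est - xco2, \qquad
XCO2 = (\beta_0 - \Delta) - (\overline{xco2} - \beta_0)
$$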
oco2 |>
mutate(
xco2_est = coefs[1] + coefs[2] * as.numeric(data),
delta = xco2_est - xco2,
XCO2 = (coefs[1]-delta) - (mean(xco2) - coefs[1])
)
#> # A tibble: 37,387 x 26
#> longitude longitude~1 latit~2 latit~3 time_~4 time_~5 altit~6 alt_b~7 fluor~8
#> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
#> 1 -70.5 -71.0:-70.0 -5.5 -6.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.27e17
#> 2 -70.5 -71.0:-70.0 -4.5 -5.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.56e17
#> 3 -69.5 -70.0:-69.0 -10.5 -11.0:~ 2.01e13 201409~ 3308. 0.0:66~ 7.17e17
#> 4 -69.5 -70.0:-69.0 -9.5 -10.0:~ 2.01e13 201409~ 3308. 0.0:66~ 7.34e17
#> 5 -69.5 -70.0:-69.0 -8.5 -9.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.68e17
#> 6 -69.5 -70.0:-69.0 -7.5 -8.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.46e17
#> 7 -69.5 -70.0:-69.0 -6.5 -7.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.40e17
#> 8 -69.5 -70.0:-69.0 -5.5 -6.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 6.91e17
#> 9 -68.5 -69.0:-68.0 -10.5 -11.0:~ 2.01e13 201409~ 3308. 0.0:66~ 6.99e17
#> 10 -46.5 -47.0:-46.0 -1.5 -2.0:-~ 2.01e13 201409~ 3308. 0.0:66~ 7.74e17
#> # ... with 37,377 more rows, 17 more variables:
#> # fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> # xco2_moles_mole_1 <dbl>, aerosol_total_aod <dbl>,
#> # fluorescence_offset_relative_771nm_idp <dbl>,
#> # fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> # fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl>,
#> # fluorescence_offset_relative_757nm_idp <dbl>, ...
glimpse(oco2)
#> Rows: 37,387
#> Columns: 24
#> $ longitude <dbl> -70.5, -~
#> $ longitude_bnds <chr> "-71.0:-~
#> $ latitude <dbl> -5.5, -4~
#> $ latitude_bnds <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss <chr> "2014090~
#> $ altitude_km <dbl> 3307.8, ~
#> $ alt_bnds_km <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.537127~
#> $ xco2_moles_mole_1 <dbl> 0.000394~
#> $ aerosol_total_aod <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2 <dbl> 387.2781~
#> $ xco2 <dbl> 394.3686~
#> $ data <dttm> 2014-09~
#> $ ano <dbl> 2014, 20~
#> $ mes <dbl> 9, 9, 9,~
#> $ dia <int> 6, 6, 6,~
#> $ dia_semana <dbl> 7, 7, 7,~
oco2 %>%
ggplot(aes(x=data,y=XCO2)) +
geom_point(shape=21,color="black",fill="gray") +
geom_smooth(method = "lm") +
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
# oco2 %>%
# sample_n(1000) %>%
# ggplot(aes(x = longitude, y = latitude)) +
# geom_point(color = "blue")
# br <- geobr::read_country(showProgress = FALSE)
# br %>%
# ggplot() +
# geom_sf(fill = "white") +
# geom_point(data=oco2 %>%
# sample_n(1000),
# aes(x=longitude,y=latitude),
# shape=3,
# col="red",
# alpha=0.2)
Note that we use dplyr::sample_n() to plot only a subsample of 1,000 points.
# skim(oco2_br)
# help(data_fco2)
glimpse(data_fco2)
#> Rows: 15,397
#> Columns: 39
#> $ experimento <chr> "Espacial", "Espacial", "Espacial", "Espacial", "Esp~
#> $ data <date> 2001-07-10, 2001-07-10, 2001-07-10, 2001-07-10, 200~
#> $ manejo <chr> "convencional", "convencional", "convencional", "con~
#> $ tratamento <chr> "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN", "AD_GN"~
#> $ revolvimento_solo <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL~
#> $ data_preparo <date> 2001-07-01, 2001-07-01, 2001-07-01, 2001-07-01, 200~
#> $ conversao <date> 1970-01-01, 1970-01-01, 1970-01-01, 1970-01-01, 197~
#> $ cobertura <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE~
#> $ cultura <chr> "milho_soja", "milho_soja", "milho_soja", "milho_soj~
#> $ x <dbl> 0, 40, 80, 10, 25, 40, 55, 70, 20, 40, 60, 10, 70, 3~
#> $ y <dbl> 0, 0, 0, 10, 10, 10, 10, 10, 20, 20, 20, 25, 25, 30,~
#> $ longitude_muni <dbl> 782062.7, 782062.7, 782062.7, 782062.7, 782062.7, 78~
#> $ latitude_muni <dbl> 7647674, 7647674, 7647674, 7647674, 7647674, 7647674~
#> $ estado <chr> "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP", "SP"~
#> $ municipio <chr> "Jaboticabal", "Jaboticabal", "Jaboticabal", "Jaboti~
#> $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1~
#> $ prof <chr> "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1", "0-0.1"~
#> $ FCO2 <dbl> 1.080, 0.825, 1.950, 0.534, 0.893, 0.840, 1.110, 1.8~
#> $ Ts <dbl> 18.73, 18.40, 19.20, 18.28, 18.35, 18.47, 19.10, 18.~
#> $ Us <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ pH <dbl> 5.1, 5.1, 5.8, 5.3, 5.5, 5.7, 5.6, 6.4, 5.3, 5.8, 5.~
#> $ MO <dbl> 20, 24, 25, 23, 23, 21, 26, 23, 25, 24, 26, 20, 25, ~
#> $ P <dbl> 46, 26, 46, 78, 60, 46, 55, 92, 55, 60, 48, 71, 125,~
#> $ K <dbl> 2.4, 2.2, 5.3, 3.6, 3.4, 2.9, 4.0, 2.3, 3.3, 3.6, 4.~
#> $ Ca <dbl> 25, 30, 41, 27, 33, 38, 35, 94, 29, 36, 37, 29, 50, ~
#> $ Mg <dbl> 11, 11, 25, 11, 15, 20, 16, 65, 11, 17, 15, 11, 30, ~
#> $ H_Al <dbl> 31, 31, 22, 28, 27, 22, 22, 12, 31, 28, 28, 31, 18, ~
#> $ SB <dbl> 38.4, 43.2, 71.3, 41.6, 50.6, 60.9, 55.0, 161.3, 43.~
#> $ CTC <dbl> 69.4, 74.2, 93.3, 69.6, 77.9, 82.9, 77.0, 173.3, 74.~
#> $ V <dbl> 55, 58, 76, 60, 65, 73, 71, 93, 58, 67, 67, 58, 82, ~
#> $ Ds <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ Macro <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ Micro <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ VTP <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ PLA <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ AT <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ SILTE <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ ARG <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ HLIFS <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
data_fco2 %>%
group_by(experimento, cultura, data) %>%
summarise(FCO2 = mean(FCO2, na.rm=TRUE)) %>%
ggplot(aes(y=FCO2, x=data)) +
geom_line() +
facet_wrap(~experimento + cultura, scales = "free")
# br %>%
# ggplot() +
# geom_sf(fill = "white") +
# geom_point(data=oco2 %>% sample_n(1000),
# aes(x=longitude,y=latitude),
# shape=3,
# col="red",
# alpha=0.2)
Note that we use dplyr::sample_n() to take only a subsample of the records.
# skim(data_fco2)
visdat::vis_miss(data_fco2 %>%
sample_n(15000))
# skim(dados_estacao)
dados_estacao <- dados_estacao %>%
drop_na()
visdat::vis_miss(dados_estacao)
# xCO2 list
# 01 - convert the dates, which are in year-month-day-hours-min-seconds,
# into another column named 'data' as year-month-day
# Do this in a pipeline, using mutate from the dplyr package and, most likely,
# functions from the lubridate package
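# Illustrative example (value assumed for demonstration): for
# time_yyyymmddhhmmss = 20140906123000,
#   20140906123000 %/% 1e10        == 2014 (year)
#   20140906123000 %/% 1e8 %% 100  == 9    (month)
#   20140906123000 %/% 1e6 %% 100  == 6    (day)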
oco2 <- oco2 %>%
mutate (
ano = time_yyyymmddhhmmss%/%1e10,
mês = time_yyyymmddhhmmss%/%1e8 %%100,
dia = time_yyyymmddhhmmss%/%1e6 %%100,
data = as.Date(stringr::str_c(ano,mês,dia,sep="-"))
) %>%
glimpse()
#> Rows: 37,387
#> Columns: 25
#> $ longitude <dbl> -70.5, -~
#> $ longitude_bnds <chr> "-71.0:-~
#> $ latitude <dbl> -5.5, -4~
#> $ latitude_bnds <chr> "-6.0:-5~
#> $ time_yyyymmddhhmmss <dbl> 2.014091~
#> $ time_bnds_yyyymmddhhmmss <chr> "2014090~
#> $ altitude_km <dbl> 3307.8, ~
#> $ alt_bnds_km <chr> "0.0:661~
#> $ fluorescence_radiance_757nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 7.272876~
#> $ fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.537127~
#> $ xco2_moles_mole_1 <dbl> 0.000394~
#> $ aerosol_total_aod <dbl> 0.148579~
#> $ fluorescence_offset_relative_771nm_idp <dbl> 0.016753~
#> $ fluorescence_at_reference_ph_sec_1_m_2_sr_1_um_1 <dbl> 2.615319~
#> $ fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 3.088582~
#> $ fluorescence_offset_relative_757nm_idp <dbl> 0.013969~
#> $ fluorescence_radiance_771nm_uncert_idp_ph_sec_1_m_2_sr_1_um_1 <dbl> 5.577878~
#> $ XCO2 <dbl> 387.2781~
#> $ xco2 <dbl> 394.3686~
#> $ data <date> 2014-09~
#> $ ano <dbl> 2014, 20~
#> $ mes <dbl> 9, 9, 9,~
#> $ dia <dbl> 6, 6, 6,~
#> $ dia_semana <dbl> 7, 7, 7,~
#> $ mês <dbl> 9, 9, 9,~
dados_estacao <- dados_estacao %>%
mutate(
ano = lubridate::year(data),
mês = lubridate::month(data),
dia = lubridate::day(data),
data = as.Date(stringr::str_c(ano,mês,dia,sep="-"))
)
# atributos <- data_fco2
atributos <- left_join(data_fco2, dados_estacao, by = "data")
# List of FCO2 dates
lista_data_fco2 <- unique(atributos$data)
lista_data_oco2 <- unique(oco2$data)
lista_data_estacao <- unique(dados_estacao$data)
datas_fco2 <- paste0(lubridate::year(lista_data_fco2),"-",lubridate::month(lista_data_fco2)) %>% unique()
datas_oco2 <- paste0(lubridate::year(lista_data_oco2),"-",lubridate::month(lista_data_oco2)) %>% unique()
datas <- datas_fco2[datas_fco2 %in% datas_oco2]
Creating the lists of dates, which are the key for merging the files.
fco2 <- atributos %>%
mutate(ano_mes = paste0(lubridate::year(data),"-",lubridate::month(data))) %>%
dplyr::filter(ano_mes %in% datas)
xco2 <- oco2 %>%
mutate(ano_mes=paste0(ano,"-",mês)) %>%
dplyr::filter(ano_mes %in% datas)
Checking that the year-month keys coincide between the two data sets:
unique(xco2$ano_mes)[unique(xco2$ano_mes) %>% order()] ==
unique(fco2$ano_mes)[unique(fco2$ano_mes) %>% order()]
#> [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#> [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
Approach using the {dplyr} join
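# memory.limit() only takes effect on Windows (newer R versions ignore it)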
memory.limit(size=10001)
#> [1] 10001
data_set <- left_join(fco2 %>%
mutate(ano = lubridate::year(data),
mes = lubridate::month(data)
) %>%
select(ID, data, cultura, ano, mes, x,y, FCO2, Ts,
Us, MO, Macro, VTP, ARG, ano_mes,Tmed,Tmax, Tmin, Umed,
Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
chuva, inso),
xco2 %>%
select(data,mês,dia,longitude,latitude,XCO2,fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1,fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1, ano_mes), by = "ano_mes") %>%
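  # dist: Euclidean distance (in degrees) from each OCO-2 cell to the
  # reference point (-51.423519, -20.362911); SIF: average of the 757 nm and
  # 1.5 x 771 nm fluorescence radiances, each scaled by its conversion factor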
mutate(dist = sqrt((longitude-(-51.423519))^2+(latitude-(-20.362911))^2),
SIF = (fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1*2.6250912*10^(-19) + 1.5*fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1* 2.57743*10^(-19))/2)
data_set<-data_set %>%
select(-fluorescence_radiance_757nm_idp_ph_sec_1_m_2_sr_1_um_1, -fluorescence_radiance_771nm_idp_ph_sec_1_m_2_sr_1_um_1 ) %>%
filter(dist <= .16, FCO2 <= 20 )
visdat::vis_miss(data_set %>%
sample_n(2000)
)
# head(data_set)
# fco2$ano_mes %>% unique()
# xco2$ano_mes %>% unique()
# data_set$ano_mes %>% unique()
tab_medias <- data_set %>%
# mutate(SIF = ifelse(SIF <=0, mean(data_set$SIF, na.rm=TRUE),SIF)) %>%
group_by(ano_mes, cultura) %>%
summarise(FCO2 = mean(FCO2, na.rm=TRUE),
XCO2 = mean(XCO2, na.rm=TRUE),
SIF = mean(SIF, na.rm=TRUE))
tab_medias %>%
ggplot(aes(x=XCO2, y=SIF)) +
geom_point()+
geom_smooth(method = "lm")+
theme_bw()
tab_medias %>%
ggplot(aes(x=XCO2, y=FCO2)) +
geom_point()+
geom_smooth(method = "lm")+
theme_bw()
tab_medias %>%
ggplot(aes(x=FCO2, y=SIF)) +
geom_point()+
geom_smooth(method = "lm") +
theme_bw()
To be completed later.
Defining the random seed and the training and test sets for the models.
data_set_ml <- data_set #%>%
# select(cultura, FCO2, Ts,
# Us, MO, Tmed,Tmax, Tmin, Umed,
# Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
# chuva, inso, SIF, xco2) %>%
# drop_na(FCO2, Ts,Us,Tmed:inso)
# visdat::vis_miss(data_set_ml)
# set.seed(1235)
fco2_initial_split <- initial_split(data_set_ml, prop = 0.75)
fco2_train <- training(fco2_initial_split)
# fco2_test <- testing(fco2_initial_split)
# visdat::vis_miss(fco2_test)
fco2_train %>%
ggplot(aes(x=FCO2, y=..density..))+
geom_histogram(bins = 30, color="black", fill="lightgray")+
geom_density(alpha=.05,fill="red")+
theme_bw() +
labs(x="FCO2", y = "Densidade")
fco2_train %>%
ggplot(aes(x=SIF, y=..density..))+
geom_histogram(bins = 11, color="black", fill="lightgray")+
geom_density(alpha=.05,fill="green")+
theme_bw() +
labs(x="SIF", y = "Densidade")
fco2_train %>%
ggplot(aes(x=XCO2, y=..density..))+
geom_histogram(bins = 15, color="black", fill="lightgray")+
geom_density(alpha=.05,fill="blue")+
theme_bw() +
labs(x="XCO2", y = "Densidade")
glimpse(fco2_train)
#> Rows: 6,103
#> Columns: 37
#> $ ID <int> 31, 35, 21, 3, 21, 76, 31, 35, 12, 45, 56, 49, 3, 21, 16, 19~
#> $ data.x <date> 2018-07-04, 2018-05-25, 2019-07-07, 2017-03-17, 2017-03-17,~
#> $ cultura <chr> "silvipastoril", "pasto", "pasto", "silvipastoril", "silvipa~
#> $ ano <dbl> 2018, 2018, 2019, 2017, 2017, 2017, 2018, 2017, 2018, 2018, ~
#> $ mes <dbl> 7, 5, 7, 3, 3, 6, 5, 6, 7, 6, 6, 10, 7, 7, 6, 2, 6, 9, 6, 2,~
#> $ x <dbl> 7749399, 7747950, 7747952, 40, 100, 105, 7747956, 60, 774945~
#> $ y <dbl> 457163.2, 456884.4, 456878.7, 0.0, 10.0, 40.0, 456862.8, 20.~
#> $ FCO2 <dbl> 1.06, 3.19, 1.23, 7.41, 5.36, 3.17, 1.16, 2.97, 1.13, 1.72, ~
#> $ Ts <dbl> 23.20000, 24.20000, 12.10000, 23.17000, 23.17000, 20.70000, ~
#> $ Us <dbl> 11.11000, 11.40673, 10.70000, 30.08400, 30.29000, 11.00000, ~
#> $ MO <dbl> 4, 12, 20, 29, 32, 24, 13, 26, 28, 29, 8, 2, 13, 9, 37, 29, ~
#> $ Macro <dbl> 11.45000000, 0.28000000, 0.02000000, NA, 10.94000000, NA, 0.~
#> $ VTP <dbl> 57.11000, 0.44000, 0.37000, NA, 49.77000, NA, 0.35000, NA, 5~
#> $ ARG <dbl> NA, 137.6400, 91.6200, NA, NA, NA, 112.1100, NA, 626.9800, 5~
#> $ ano_mes <chr> "2018-7", "2018-5", "2019-7", "2017-3", "2017-3", "2017-6", ~
#> $ Tmed <dbl> 23.9, 23.0, 13.1, 25.8, 25.8, 22.0, 23.0, 23.7, 15.8, 25.3, ~
#> $ Tmax <dbl> 33.4, 31.3, 22.2, 33.6, 33.6, 30.0, 31.3, 31.5, 21.8, 33.9, ~
#> $ Tmin <dbl> 16.0, 15.5, 5.1, 22.2, 22.2, 15.4, 15.5, 17.3, 11.9, 17.8, 1~
#> $ Umed <dbl> 60.1, 67.3, 61.8, 86.5, 86.5, 74.7, 67.3, 67.7, 66.4, 57.1, ~
#> $ Umax <dbl> 98.6, 90.8, 90.5, 99.9, 99.9, 90.8, 90.8, 94.8, 85.4, 93.3, ~
#> $ Umin <dbl> 27.7, 43.8, 32.3, 50.3, 50.3, 54.7, 43.8, 38.8, 46.5, 27.2, ~
#> $ PkPa <dbl> 97.6, 97.9, 98.4, 97.3, 97.3, 97.8, 97.9, 97.6, 98.3, 97.6, ~
#> $ Rad <dbl> 12.5, 12.7, 11.8, 14.9, 14.9, 13.8, 12.7, 13.7, 13.2, 12.5, ~
#> $ Eto <dbl> 2.8, 2.9, 2.3, 3.5, 3.5, 2.8, 2.9, 3.1, 2.8, 3.0, 3.2, 4.9, ~
#> $ Velmax <dbl> 4.3, 5.5, 5.9, 5.2, 5.2, 4.6, 5.5, 5.2, 6.4, 4.7, 5.9, 7.1, ~
#> $ Velmin <dbl> 0.8, 1.1, 1.1, 0.7, 0.7, 1.2, 1.1, 1.2, 2.1, 1.0, 1.6, 1.0, ~
#> $ Dir_vel <dbl> 85.2, 107.2, 129.1, 264.9, 264.9, 93.0, 107.2, 72.5, 235.2, ~
#> $ chuva <dbl> 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ~
#> $ inso <dbl> 5.5, 5.3, 4.7, 3.6, 3.6, 8.2, 5.3, 8.0, 6.1, 6.8, 8.0, 6.0, ~
#> $ data.y <date> 2018-07-09, 2018-05-06, 2019-07-28, 2017-03-16, 2017-03-16,~
#> $ mês <dbl> 7, 5, 7, 3, 3, 6, 5, 6, 7, 6, 6, 10, 7, 7, 6, 2, 6, 9, 6, 2,~
#> $ dia <dbl> 9, 6, 28, 16, 16, 29, 15, 29, 9, 16, 29, 9, 28, 28, 29, 26, ~
#> $ longitude <dbl> -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.5, -51.~
#> $ latitude <dbl> -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.5, -20.~
#> $ XCO2 <dbl> 381.6135, 385.5825, 387.1356, 384.1294, 384.1294, 386.6562, ~
#> $ dist <dbl> 0.1569801, 0.1569801, 0.1569801, 0.1569801, 0.1569801, 0.156~
#> $ SIF <dbl> 0.31308794, 0.12510317, 0.30243534, 0.66028494, 0.66028494, ~
fco2_train %>% select(-c(ID,ano,mes,x,y,latitude,longitude,dist,mês,dia)) %>%
select(where(is.numeric)) %>%
drop_na() %>%
cor() %>%
corrplot::corrplot()
fco2_recipe <- recipe(FCO2 ~ ., data = fco2_train %>%
select(-c(data.x,data.y,ID,ano,mes,x,y,latitude,longitude,dist,mês,dia,ano_mes))
) %>%
step_normalize(all_numeric_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
#step_naomit(c(Ts, Us)) %>%
#step_impute_mean(c(Us,Ts)) %>%
#step_poly(c(Us,Ts), degree = 2) %>%
step_dummy(all_nominal_predictors())
bake(prep(fco2_recipe), new_data = NULL)
#> # A tibble: 6,103 x 29
#> Ts Us MO Macro VTP ARG Tmed Tmax Tmin Umed
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.0874 -0.380 -1.80 1.15 1.04 NA -0.00687 0.422 -0.430 -1.03
#> 2 0.260 -0.340 -1.01 -0.718 -1.56 -0.956 -0.210 -0.0192 -0.541 -0.286
#> 3 -1.83 -0.437 -0.215 -0.762 -1.57 -1.20 -2.45 -1.93 -2.86 -0.856
#> 4 0.0822 2.23 0.678 NA NA NA 0.422 0.465 0.951 1.71
#> 5 0.0822 2.26 0.976 1.06 0.707 NA 0.422 0.465 0.951 1.71
#> 6 -0.344 -0.395 0.182 NA NA NA -0.436 -0.293 -0.563 0.483
#> 7 0.0874 -0.178 -0.909 -0.732 -1.57 -1.09 -0.210 -0.0192 -0.541 -0.286
#> 8 -0.982 1.10 0.381 NA NA NA -0.0521 0.0229 -0.140 -0.244
#> 9 -1.09 0.921 0.579 -0.141 0.776 1.59 -1.84 -2.02 -1.34 -0.379
#> 10 -0.395 0.274 0.678 0.579 0.862 1.41 0.309 0.528 -0.0288 -1.34
#> # ... with 6,093 more rows, and 19 more variables: Umax <dbl>, Umin <dbl>,
#> # PkPa <dbl>, Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>,
#> # Dir_vel <dbl>, chuva <dbl>, inso <dbl>, XCO2 <dbl>, SIF <dbl>, FCO2 <dbl>,
#> # cultura_eucalipto <dbl>, cultura_mata.ciliar <dbl>, cultura_pasto <dbl>,
#> # cultura_pinus <dbl>, cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_recipe), new_data = NULL))
fco2_resamples <- vfold_cv(fco2_train, v = 10)
grid <- grid_regular(
penalty(range = c(-8, 0)),
levels = 20
)
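# (note: this penalty grid is not used by the models tuned below)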
fco2_dt_model <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("rpart")
fco2_dt_wf <- workflow() %>%
add_model(fco2_dt_model) %>%
add_recipe(fco2_recipe)
grid_dt <- grid_random(
cost_complexity(c(-6, -4)),
tree_depth(range = c(8, 18)),
min_n(range = c(42, 52)),
size = 2
)
glimpse(grid_dt)
#> Rows: 2
#> Columns: 3
#> $ cost_complexity <dbl> 1.921168e-06, 3.382433e-06
#> $ tree_depth <int> 17, 15
#> $ min_n <int> 52, 43
fco2_dt_tune_grid <- tune_grid(
fco2_dt_wf,
resamples = fco2_resamples,
grid = grid_dt,
metrics = metric_set(rmse)
)
autoplot(fco2_dt_tune_grid)
collect_metrics(fco2_dt_tune_grid)
#> # A tibble: 2 x 9
#> cost_complexity tree_depth min_n .metric .estima~1 mean n std_err .config
#> <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 0.00000192 17 52 rmse standard 1.25 10 0.0292 Prepro~
#> 2 0.00000338 15 43 rmse standard 1.24 10 0.0314 Prepro~
#> # ... with abbreviated variable name 1: .estimator
fco2_dt_best_params <- select_best(fco2_dt_tune_grid, "rmse")
fco2_dt_wf <- fco2_dt_wf %>% finalize_workflow(fco2_dt_best_params)
fco2_dt_last_fit <- last_fit(fco2_dt_wf, fco2_initial_split)
fco2_test_preds <- bind_rows(
collect_predictions(fco2_dt_last_fit) %>% mutate(modelo = "dt")
)
fco2_test <- testing(fco2_initial_split)
visdat::vis_miss(fco2_test)
fco2_test_preds %>%
ggplot(aes(x=.pred, y=FCO2)) +
geom_point()+
theme_bw() +
geom_smooth(method = "lm") +
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
fco2_dt_last_fit_model <-fco2_dt_last_fit$.workflow[[1]]$fit$fit
vip(fco2_dt_last_fit_model)
da <- fco2_test_preds %>%
filter(FCO2 > 0, .pred>0 )
my_rmse <- Metrics::rmse(da$FCO2,
da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100
fco2_test_preds %>%
ggplot(aes(x=FCO2,y=.pred))+
geom_point()+
geom_smooth(method = "lm")+
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
,round(my_mape,2),'%'),size=5)+
theme_bw()
Fixing the NAs in the test set.
visdat::vis_miss(fco2_test)
data_set_ml <- data_set_ml %>%
select(cultura, FCO2, Ts, XCO2, SIF,
Us, MO, Tmed,Tmax, Tmin, Umed,
Umax, Umin, PkPa, Rad, Eto, Velmax, Velmin, Dir_vel,
chuva, inso) %>%
drop_na(FCO2, Ts,Us,Tmed:inso)
visdat::vis_miss(data_set_ml)
fco2_initial_split <- initial_split(data_set_ml, prop = 0.75)
fco2_test <- testing(fco2_initial_split)
visdat::vis_miss(fco2_test)
fco2_train <- training(fco2_initial_split)
visdat::vis_miss(fco2_train)
fco2_resamples_rf <- vfold_cv(fco2_train, v = 5)
fco2_rf_recipe <- recipe(FCO2 ~ ., data = fco2_train) %>%
step_string2factor(all_nominal(), skip = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
# step_naomit(all_predictors()) #%>%
# step_impute_mean(c(Ts,Us)) %>%
# step_poly(c(Ts, Us), degree = 2) %>%
step_dummy(all_nominal_predictors())
bake(prep(fco2_rf_recipe), new_data = NULL)
#> # A tibble: 5,459 x 26
#> Ts XCO2 SIF Us MO Tmed Tmax Tmin Umed Umax
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.193 0.114 0.107 -0.181 0.673 -0.429 -0.279 -0.564 0.466 -0.297
#> 2 1.10 -0.832 0.474 0.489 0.769 0.429 0.288 1.04 2.19 1.03
#> 3 0.647 -0.743 1.20 -0.987 0.577 0.948 0.666 1.08 -0.0100 0.0922
#> 4 -0.00815 1.27 -1.26 -0.0801 -0.767 -0.181 -0.0479 -0.319 -0.941 -0.786
#> 5 -0.479 1.27 -1.26 0.361 0.481 -0.542 -0.258 -0.742 0.621 1.03
#> 6 0.949 0.734 1.53 -0.289 -0.575 0.632 0.898 0.840 1.41 1.03
#> 7 0.513 -0.177 -0.533 -0.569 -0.671 0.158 -0.0899 0.350 -1.37 -2.72
#> 8 0.513 1.27 -1.26 0.324 -1.73 -0.632 -0.720 -0.453 1.10 0.625
#> 9 0.882 -0.177 -0.533 -0.915 -0.767 0.158 -0.0899 0.350 -1.37 -2.72
#> 10 -0.0250 -1.97 -0.180 1.78 -1.44 -1.40 -2.15 -0.698 1.30 0.553
#> # ... with 5,449 more rows, and 16 more variables: Umin <dbl>, PkPa <dbl>,
#> # Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>, Dir_vel <dbl>,
#> # chuva <dbl>, inso <dbl>, FCO2 <dbl>, cultura_eucalipto <dbl>,
#> # cultura_mata.ciliar <dbl>, cultura_pasto <dbl>, cultura_pinus <dbl>,
#> # cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_rf_recipe), new_data = NULL))
fco2_rf_model <- rand_forest(
min_n = tune(),
mtry = tune(),
trees = tune()
) %>%
set_mode("regression") %>%
set_engine("randomForest")
fco2_rf_wf <- workflow() %>%
add_model(fco2_rf_model) %>%
add_recipe(fco2_rf_recipe)
| mtry | trees | min_n | .config               |
|-----:|------:|------:|-----------------------|
|   10 |   769 |    21 | Preprocessor1_Model39 |
grid_rf <- grid_random(
min_n(range = c(20, 30)),
mtry(range = c(10,20)),
trees(range = c(769,1500) ),
size = 20
)
fco2_rf_tune_grid <- tune_grid(
fco2_rf_wf,
resamples = fco2_resamples_rf,
grid = grid_rf,
metrics = metric_set(rmse)
)
autoplot(fco2_rf_tune_grid)
collect_metrics(fco2_rf_tune_grid)
#> # A tibble: 20 x 9
#> mtry trees min_n .metric .estimator mean n std_err .config
#> <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 12 1452 20 rmse standard 1.14 5 0.0297 Preprocessor1_Model~
#> 2 20 1372 21 rmse standard 1.13 5 0.0317 Preprocessor1_Model~
#> 3 15 1255 24 rmse standard 1.14 5 0.0305 Preprocessor1_Model~
#> 4 18 1488 26 rmse standard 1.14 5 0.0302 Preprocessor1_Model~
#> 5 18 1406 26 rmse standard 1.14 5 0.0307 Preprocessor1_Model~
#> 6 11 1281 22 rmse standard 1.15 5 0.0291 Preprocessor1_Model~
#> 7 14 1443 25 rmse standard 1.14 5 0.0296 Preprocessor1_Model~
#> 8 17 1045 22 rmse standard 1.14 5 0.0303 Preprocessor1_Model~
#> 9 12 929 25 rmse standard 1.15 5 0.0291 Preprocessor1_Model~
#> 10 19 1110 20 rmse standard 1.13 5 0.0319 Preprocessor1_Model~
#> 11 20 899 25 rmse standard 1.14 5 0.0305 Preprocessor1_Model~
#> 12 18 1137 28 rmse standard 1.14 5 0.0303 Preprocessor1_Model~
#> 13 17 797 22 rmse standard 1.14 5 0.0312 Preprocessor1_Model~
#> 14 14 787 27 rmse standard 1.14 5 0.0292 Preprocessor1_Model~
#> 15 15 876 25 rmse standard 1.14 5 0.0302 Preprocessor1_Model~
#> 16 20 1077 26 rmse standard 1.14 5 0.0306 Preprocessor1_Model~
#> 17 14 1220 27 rmse standard 1.14 5 0.0297 Preprocessor1_Model~
#> 18 16 1010 20 rmse standard 1.14 5 0.0315 Preprocessor1_Model~
#> 19 20 1128 22 rmse standard 1.14 5 0.0315 Preprocessor1_Model~
#> 20 16 1214 30 rmse standard 1.14 5 0.0298 Preprocessor1_Model~
fco2_rf_best_params <- select_best(fco2_rf_tune_grid, "rmse")
fco2_rf_wf <- fco2_rf_wf %>% finalize_workflow(fco2_rf_best_params)
fco2_rf_last_fit <- last_fit(fco2_rf_wf, fco2_initial_split)
fco2_test_preds <- bind_rows(
collect_predictions(fco2_rf_last_fit) %>% mutate(modelo = "rf")
)
fco2_test_preds %>%
ggplot(aes(x=.pred, y=FCO2)) +
geom_point()+
theme_bw() +
geom_smooth(method = "lm") +
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
fco2_rf_last_fit_model <-fco2_rf_last_fit$.workflow[[1]]$fit$fit
vip(fco2_rf_last_fit_model)
da <- fco2_test_preds %>%
filter(FCO2 > 0, .pred> 0 )
my_rmse <- Metrics::rmse(da$FCO2,
da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100
fco2_test_preds %>%
ggplot(aes(x=FCO2,y=.pred))+
geom_point()+
geom_smooth(method = "lm")+
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
,round(my_mape,2),'%'),size=5)+
theme_bw()
fco2_xgb_recipe <- recipe(FCO2 ~ ., data = fco2_train) %>%
step_string2factor(all_nominal(), skip = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_novel(all_nominal_predictors()) %>%
# step_zv(all_predictors()) %>%
# step_naomit(all_predictors()) #%>%
#step_poly(c(Ts, Us), degree = 3) %>%
step_dummy(all_nominal_predictors())
bake(prep(fco2_xgb_recipe), new_data = NULL)
#> # A tibble: 5,459 x 26
#> Ts XCO2 SIF Us MO Tmed Tmax Tmin Umed Umax
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -0.193 0.114 0.107 -0.181 0.673 -0.429 -0.279 -0.564 0.466 -0.297
#> 2 1.10 -0.832 0.474 0.489 0.769 0.429 0.288 1.04 2.19 1.03
#> 3 0.647 -0.743 1.20 -0.987 0.577 0.948 0.666 1.08 -0.0100 0.0922
#> 4 -0.00815 1.27 -1.26 -0.0801 -0.767 -0.181 -0.0479 -0.319 -0.941 -0.786
#> 5 -0.479 1.27 -1.26 0.361 0.481 -0.542 -0.258 -0.742 0.621 1.03
#> 6 0.949 0.734 1.53 -0.289 -0.575 0.632 0.898 0.840 1.41 1.03
#> 7 0.513 -0.177 -0.533 -0.569 -0.671 0.158 -0.0899 0.350 -1.37 -2.72
#> 8 0.513 1.27 -1.26 0.324 -1.73 -0.632 -0.720 -0.453 1.10 0.625
#> 9 0.882 -0.177 -0.533 -0.915 -0.767 0.158 -0.0899 0.350 -1.37 -2.72
#> 10 -0.0250 -1.97 -0.180 1.78 -1.44 -1.40 -2.15 -0.698 1.30 0.553
#> # ... with 5,449 more rows, and 16 more variables: Umin <dbl>, PkPa <dbl>,
#> # Rad <dbl>, Eto <dbl>, Velmax <dbl>, Velmin <dbl>, Dir_vel <dbl>,
#> # chuva <dbl>, inso <dbl>, FCO2 <dbl>, cultura_eucalipto <dbl>,
#> # cultura_mata.ciliar <dbl>, cultura_pasto <dbl>, cultura_pinus <dbl>,
#> # cultura_silvipastoril <dbl>, cultura_new <dbl>
visdat::vis_miss(bake(prep(fco2_xgb_recipe), new_data = NULL))
Find a combination of learn_rate and trees that works reasonably well:

- learn_rate: 0.05, 0.1, 0.3
- trees: 100, 500, 1000, 1500
cores = 4
fco2_xgb_model <- boost_tree(
mtry = 0.8,
trees = tune(), # <---------------
min_n = 5,
tree_depth = 4,
loss_reduction = 0, # lambda
learn_rate = tune(), # epsilon
sample_size = 0.8
) %>%
set_mode("regression") %>%
set_engine("xgboost", nthread = cores, counts = FALSE)
fco2_xgb_wf <- workflow() %>%
add_model(fco2_xgb_model) %>%
add_recipe(fco2_xgb_recipe)
grid_xgb <- expand.grid(
learn_rate = c(0.05, 0.3),
trees = c(2, 250, 500)
)
fco2_xgb_tune_grid <- tune_grid(
fco2_xgb_wf,
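# note: fco2_resamples is the 10-fold CV object created earlier from the first training split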
resamples = fco2_resamples,
grid = grid_xgb,
metrics = metric_set(rmse)
)
autoplot(fco2_xgb_tune_grid)
fco2_xgb_tune_grid %>% show_best(metric = "rmse", n = 6)
#> # A tibble: 6 x 8
#> trees learn_rate .metric .estimator mean n std_err .config
#> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 500 0.05 rmse standard 1.16 10 0.0292 Preprocessor1_Model3
#> 2 250 0.05 rmse standard 1.17 10 0.0285 Preprocessor1_Model2
#> 3 250 0.3 rmse standard 1.18 10 0.0309 Preprocessor1_Model5
#> 4 500 0.3 rmse standard 1.19 10 0.0312 Preprocessor1_Model6
#> 5 2 0.3 rmse standard 2.03 10 0.0365 Preprocessor1_Model4
#> 6 2 0.05 rmse standard 3.11 10 0.0400 Preprocessor1_Model1
fco2_xgb_select_best_passo1 <- fco2_xgb_tune_grid %>%
select_best(metric = "rmse")
fco2_xgb_select_best_passo1
#> # A tibble: 1 x 3
#> trees learn_rate .config
#> <dbl> <dbl> <chr>
#> 1 500 0.05 Preprocessor1_Model3
These are good initial values. Now we can tune the tree-related parameters.

- tree_depth: we will let it vary between 3 and 10.
- min_n: we will let it vary between 5 and 90.
fco2_xgb_model <- boost_tree(
mtry = 0.8,
trees = fco2_xgb_select_best_passo1$trees,
min_n = tune(),
tree_depth = tune(),
loss_reduction = 0,
learn_rate = fco2_xgb_select_best_passo1$learn_rate,
sample_size = 0.8
) %>%
set_mode("regression") %>%
set_engine("xgboost", nthread = cores, counts = FALSE)
#### Workflow
fco2_xgb_wf <- workflow() %>%
add_model(fco2_xgb_model) %>%
add_recipe(fco2_xgb_recipe)
#### Grid
fco2_xgb_grid <- expand.grid(
tree_depth = c(1, 3, 4),
min_n = c(5, 30, 60)
)
fco2_xgb_tune_grid <- fco2_xgb_wf %>%
tune_grid(
resamples =fco2_resamples,
grid = fco2_xgb_grid,
control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
metrics = metric_set(rmse)
)
#### Best hyperparameters
autoplot(fco2_xgb_tune_grid)
fco2_xgb_tune_grid %>% show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 8
#> min_n tree_depth .metric .estimator mean n std_err .config
#> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 5 4 rmse standard 1.17 10 0.0285 Preprocessor1_Model3
#> 2 30 4 rmse standard 1.18 10 0.0297 Preprocessor1_Model6
#> 3 5 3 rmse standard 1.18 10 0.0290 Preprocessor1_Model2
#> 4 60 4 rmse standard 1.19 10 0.0275 Preprocessor1_Model9
#> 5 30 3 rmse standard 1.20 10 0.0288 Preprocessor1_Model5
fco2_xgb_select_best_passo2 <- fco2_xgb_tune_grid %>% select_best(metric = "rmse")
fco2_xgb_select_best_passo2
#> # A tibble: 1 x 3
#> min_n tree_depth .config
#> <dbl> <dbl> <chr>
#> 1 5 4 Preprocessor1_Model3
Let's now tune loss_reduction.
fco2_xgb_model <- boost_tree(
mtry = 0.8,
trees = fco2_xgb_select_best_passo1$trees,
min_n = fco2_xgb_select_best_passo2$min_n,
tree_depth = fco2_xgb_select_best_passo2$tree_depth,
loss_reduction =tune(),
learn_rate = fco2_xgb_select_best_passo1$learn_rate,
sample_size = 0.8
) %>%
set_mode("regression") %>%
set_engine("xgboost", nthread = cores, counts = FALSE)
#### Workflow
fco2_xgb_wf <- workflow() %>%
add_model(fco2_xgb_model) %>%
add_recipe(fco2_xgb_recipe)
#### Grid
fco2_xgb_grid <- expand.grid(
loss_reduction = c(0.01, 0.05, 1, 2, 4, 8)
)
fco2_xgb_tune_grid <- fco2_xgb_wf %>%
tune_grid(
resamples = fco2_resamples,
grid = fco2_xgb_grid,
control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
metrics = metric_set(rmse)
)
#### Best hyperparameters
autoplot(fco2_xgb_tune_grid)
fco2_xgb_tune_grid %>% show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 7
#> loss_reduction .metric .estimator mean n std_err .config
#> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 0.05 rmse standard 1.16 10 0.0293 Preprocessor1_Model2
#> 2 0.01 rmse standard 1.16 10 0.0304 Preprocessor1_Model1
#> 3 2 rmse standard 1.17 10 0.0286 Preprocessor1_Model4
#> 4 1 rmse standard 1.17 10 0.0286 Preprocessor1_Model3
#> 5 4 rmse standard 1.17 10 0.0294 Preprocessor1_Model5
fco2_xgb_select_best_passo3 <- fco2_xgb_tune_grid %>% select_best(metric = "rmse")
fco2_xgb_select_best_passo3
#> # A tibble: 1 x 2
#> loss_reduction .config
#> <dbl> <chr>
#> 1 0.05 Preprocessor1_Model2
Now let's tune mtry and sample_size:
fco2_xgb_model <- boost_tree(
mtry = tune(),
trees = fco2_xgb_select_best_passo1$trees,
min_n = fco2_xgb_select_best_passo2$min_n,
tree_depth = fco2_xgb_select_best_passo2$tree_depth,
loss_reduction = fco2_xgb_select_best_passo3$loss_reduction,
learn_rate = fco2_xgb_select_best_passo1$learn_rate,
sample_size = tune()
)%>%
set_mode("regression") |>
set_engine("xgboost", nthread = cores, counts = FALSE)
#### Workflow
fco2_xgb_wf <- workflow() %>%
add_model(fco2_xgb_model) %>%
add_recipe(fco2_xgb_recipe)
#### Grid
fco2_xgb_grid <- expand.grid(
sample_size = seq(0.5, 1.0, length.out = 2), ## <---
mtry = seq(0.1, 1.0, length.out = 2) ## <---
)
fco2_xgb_tune_grid <- fco2_xgb_wf %>%
tune_grid(
resamples = fco2_resamples,
grid = fco2_xgb_grid,
control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
metrics = metric_set(rmse)
)
#### Best hyperparameters
autoplot(fco2_xgb_tune_grid)
fco2_xgb_tune_grid |> show_best(metric = "rmse", n = 5)
#> # A tibble: 4 x 8
#> mtry sample_size .metric .estimator mean n std_err .config
#> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 1 1 rmse standard 1.16 10 0.0275 Preprocessor1_Model4
#> 2 1 0.5 rmse standard 1.18 10 0.0297 Preprocessor1_Model3
#> 3 0.1 1 rmse standard 1.19 10 0.0277 Preprocessor1_Model2
#> 4 0.1 0.5 rmse standard 1.19 10 0.0279 Preprocessor1_Model1
fco2_xgb_select_best_passo4 <- fco2_xgb_tune_grid %>% select_best(metric = "rmse")
fco2_xgb_select_best_passo4
#> # A tibble: 1 x 3
#> mtry sample_size .config
#> <dbl> <dbl> <chr>
#> 1 1 1 Preprocessor1_Model4
Now let's tune learn_rate and trees again, this time letting learn_rate take smaller values.
fco2_xgb_model <- boost_tree(
mtry = fco2_xgb_select_best_passo4$mtry,
trees = tune(),
min_n = fco2_xgb_select_best_passo2$min_n,
tree_depth = fco2_xgb_select_best_passo2$tree_depth,
loss_reduction = fco2_xgb_select_best_passo3$loss_reduction,
learn_rate = tune(),
sample_size = fco2_xgb_select_best_passo4$sample_size
) |>
set_mode("regression") %>%
set_engine("xgboost", nthread = cores, counts = FALSE)
#### Workflow
fco2_xgb_wf <- workflow() %>%
add_model(fco2_xgb_model) %>%
add_recipe(fco2_xgb_recipe)
#### Grid
fco2_xgb_grid <- expand.grid(
learn_rate = c(0.05, 0.10, 0.15, 0.25),
trees = c(100, 250, 500)
)
fco2_xgb_tune_grid <- fco2_xgb_wf %>%
tune_grid(
resamples = fco2_resamples,
grid = fco2_xgb_grid,
control = control_grid(save_pred = TRUE, verbose = FALSE, allow_par = TRUE),
metrics = metric_set(rmse)
)
#### Best hyperparameters
autoplot(fco2_xgb_tune_grid)
fco2_xgb_tune_grid %>% show_best(metric = "rmse", n = 5)
#> # A tibble: 5 x 8
#> trees learn_rate .metric .estimator mean n std_err .config
#> <dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 500 0.25 rmse standard 1.13 10 0.0307 Preprocessor1_Model12
#> 2 500 0.15 rmse standard 1.14 10 0.0292 Preprocessor1_Model09
#> 3 250 0.25 rmse standard 1.14 10 0.0268 Preprocessor1_Model11
#> 4 250 0.15 rmse standard 1.15 10 0.0271 Preprocessor1_Model08
#> 5 500 0.1 rmse standard 1.15 10 0.0288 Preprocessor1_Model06
fco2_xgb_select_best_passo5 <- fco2_xgb_tune_grid %>% select_best(metric = "rmse")
fco2_xgb_select_best_passo5
#> # A tibble: 1 x 3
#> trees learn_rate .config
#> <dbl> <dbl> <chr>
#> 1 500 0.25 Preprocessor1_Model12
fco2_xgb_model <- boost_tree(
mtry = fco2_xgb_select_best_passo4$mtry,
trees = fco2_xgb_select_best_passo5$trees,
min_n = fco2_xgb_select_best_passo2$min_n,
tree_depth = fco2_xgb_select_best_passo2$tree_depth,
loss_reduction = fco2_xgb_select_best_passo3$loss_reduction,
learn_rate = fco2_xgb_select_best_passo5$learn_rate,
sample_size = fco2_xgb_select_best_passo4$sample_size
) %>%
set_mode("regression") %>%
set_engine("xgboost", nthread = cores, counts = FALSE)
# fco2_xgb_best_params <- select_best(fco2_xgb_tune_grid, "rmse")
df <- data.frame(
mtry = fco2_xgb_select_best_passo4$mtry,
trees = fco2_xgb_select_best_passo5$trees,
min_n = fco2_xgb_select_best_passo2$min_n,
tree_depth = fco2_xgb_select_best_passo2$tree_depth,
loss_reduction = fco2_xgb_select_best_passo3$loss_reduction,
learn_rate = fco2_xgb_select_best_passo5$learn_rate,
sample_size = fco2_xgb_select_best_passo4$sample_size
)
fco2_xgb_wf <- fco2_xgb_wf %>% finalize_workflow(df) # <------
fco2_xgb_last_fit <- last_fit(fco2_xgb_wf, fco2_initial_split) # <--------
fco2_test_preds <- bind_rows(
collect_predictions(fco2_xgb_last_fit) %>% mutate(modelo = "xgb")
)
fco2_test_preds %>%
ggplot(aes(x=.pred, y=FCO2)) +
geom_point()+
theme_bw() +
geom_smooth(method = "lm") +
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")))
fco2_xgb_last_fit_model <-fco2_xgb_last_fit$.workflow[[1]]$fit$fit
vip(fco2_xgb_last_fit_model)
da <- fco2_test_preds %>%
filter(FCO2 > 0, .pred> 0 )
my_rmse <- Metrics::rmse(da$FCO2,
da$.pred)
my_mape <- Metrics::mape(da$FCO2,da$.pred)*100
fco2_test_preds %>%
ggplot(aes(x=FCO2,y=.pred))+
geom_point()+
geom_smooth(method = "lm")+
stat_regline_equation(ggplot2::aes(
label = paste(..eq.label.., ..rr.label.., sep = "*plain(\",\")~~")),size=5)+
ggplot2::annotate('text',x=10.4,y=16.7,label=paste0('RMSE = ',round(my_rmse,2),', MAPE = '
,round(my_mape,2),'%'),size=5)+
theme_bw()