ATTENZIONE¶

SEGUIRE QUESTO LINK

In [1]:
import pandas as pd
import os
import xarray as xr
import hashlib
import plotly.graph_objects as go

from pycaret.time_series import TSForecastingExperiment
from datetime import datetime, timedelta
from tqdm import tqdm
from glob import glob
from math import dist

Preprocessing - Da raster a csv¶

Funzione per generare un id univoco della cella partendo dal passo della griglia e dalle coordinate del centro della cella.

In [2]:
def get_id_cell(row, distance_in):
    """Build a deterministic id for a grid cell.

    The id is the MD5 hex digest of a key made of the grid step
    (first three characters of its string form, dot replaced by
    underscore) and the latitude/longitude of the cell centre.
    """
    step_token = str(distance_in)[:3].replace('.', '_')
    raw_key = f'{step_token}__{row["latitude"]}_{row["longitude"]}'
    return hashlib.md5(raw_key.encode()).hexdigest()
In [3]:
# Base folders (under the user's home) for the Copernicus/CAMS raw NetCDF
# files and for derived data.
home_dir = os.path.expanduser('~')
cams_nc_path = os.path.join(home_dir, 'ILAB_DATA', 'COPERNICUS', 'NC')
cams_data_path = os.path.join(home_dir, 'ILAB_DATA', 'COPERNICUS', 'DATA')
In [ ]:
# Collect (directory, filename) pairs for every NetCDF file and record the
# month tokens of the first and last file (filenames look like
# <prefix>_<loc>_<YYYYMM>.nc, so fname[2] is the YYYYMM token).
# Fix: the original `if i==1 / elif i==tot` left end_m undefined when there
# was exactly one file (and both undefined for an empty list), crashing at
# the final print with a NameError.
file_list = glob(os.path.join(cams_nc_path, '*.nc'))
file_list.sort()
output = list()
start_m = end_m = None  # remain None when no .nc file is found
for file in file_list:
    head, tail = os.path.split(file)
    fname = tail.replace('.nc', '').split('_')
    if start_m is None:
        start_m = fname[2]  # token of the first (sorted) file
    end_m = fname[2]  # last iteration wins -> token of the final file
    output.append((head, tail))
print(start_m, end_m)

Viene selezionata una cella corrispondente ad una parte del comune di Cagliari (centro).

In [5]:
cell_sel = '54cfb985fad9555c04e4b4135a471259'

Lettura dei file nc, arricchimento delle informazioni (tempo assoluto, id cella...), formattazione dei dati (formato datetime) e filtro dei dati sulla cella selezionata.

In [ ]:
# Read every monthly NetCDF file, enrich it (absolute time, cell id),
# and keep only the rows belonging to the selected cell.
# Fix: frames are accumulated in a list and concatenated once at the end,
# instead of growing `out` with pd.concat inside the loop (quadratic copies).
cell_frames = []
for file in tqdm(output, total=len(output)):
    cams_data_fname = os.path.join(file[0], file[1])
    ds = xr.open_dataset(cams_data_fname)
    df = ds.to_dataframe().reset_index()
    ds.close()
    # Strip the '_conc' suffix so pollutant columns get short names (pm2p5, no2, ...).
    df.columns = [col.replace('_conc', '') for col in df.columns]
    variables = list(df.columns)
    variables = [element for element in variables if element not in ['longitude', 'latitude', 'level', 'time']]
    # NOTE(review): aggs_dict is built but never used in this cell.
    aggs_dict = {element: 'mean' for element in variables}
    # The filename carries the month token (YYYYMM); 'time' in the file is a
    # timedelta offset from the start of that month.
    timestart = datetime.strptime(file[1].split('_')[2].split('.')[0], '%Y%m')
    # First day of the following month (adding 31 days always lands in the next month).
    timeend = datetime.strptime((timestart + timedelta(days=31)).strftime('%Y-%m-01'), '%Y-%m-%d')
    df['time_abs'] = timestart + df['time']
    df = df[df['time_abs'] < timeend].copy()
    # Grid step inferred from the two first (lat, lon)-sorted cell centres.
    # NOTE(review): dist() here compares longitudes only (lats is unused) —
    # confirm the grid step is the same along both axes.
    lons = df[df['time_abs'] == timestart].sort_values(by=['latitude', 'longitude']).head(2)['longitude'].values
    lats = df[df['time_abs'] == timestart].sort_values(by=['latitude', 'longitude']).head(2)['latitude'].values
    distance = dist([lons[1]], [lons[0]])
    df['id_cell'] = df.apply(get_id_cell, distance_in=distance, axis='columns')
    df.drop(columns=['longitude', 'latitude', 'level', 'time'], inplace=True)
    # Drop auxiliary species that are not present in every file.
    d_columns = ['pm2p5_total_om', 'ecres', 'ectot']
    for d_column in d_columns:
        if d_column in df.columns:
            del df[d_column]
    cell_frames.append(df[df['id_cell'] == cell_sel].copy())
out = pd.concat(cell_frames) if cell_frames else pd.DataFrame()
out.sort_values(by='time_abs', inplace=True)
In [ ]:
print(out['time_abs'].min(), out['time_abs'].max())

Visualizzazioni time trend della variabile pm2.5¶

Andamento orario nei 4 anni¶

In [ ]:
# Hourly pm2.5 trend over the whole period for the selected cell.
fig = go.Figure(data=[go.Scatter(x=out['time_abs'], y=out['pm2p5'], name='pm2p5')])
fig.show()
In [ ]:
# Hourly trend for a single day.
day = '2021-05-01'
day_start = datetime.strptime(day, '%Y-%m-%d').date()
day_mask = out['time_abs'].dt.date == day_start
df_giorno = out[day_mask]

fig = go.Figure(data=[go.Scatter(x=df_giorno['time_abs'], y=df_giorno['pm2p5'], name='pm2p5')])
fig.show()

Andamento orario per il mese di maggio 2021¶

In [ ]:
# Hourly trend for the 31 days starting at day_start (May 2021).
day_end = day_start + timedelta(days=31)
month_mask = (out['time_abs'].dt.date >= day_start) & (out['time_abs'].dt.date < day_end)
df_month = out[month_mask]

fig = go.Figure(data=[go.Scatter(x=df_month['time_abs'], y=df_month['pm2p5'], name='pm2p5')])
fig.show()
In [11]:
out.set_index('time_abs', inplace=True)
In [ ]:
# Forecast target plus the exogenous pollutant series kept for modelling.
target = 'pm2p5'
exog_vars = ['co', 'nh3', 'no2', 'o3', 'pm10', 'pmwf', 'so2']
include = [target, *exog_vars]
y = out[include]
y.head()
In [24]:
# Alternative aggregations kept for reference (univariate / daily):
#y = out['pm2p5']
#y = y.resample('1D').mean()
# Regularise to an hourly grid and forward-fill gaps introduced by resampling.
y = y.resample('1H').mean().ffill()
In [25]:
y = y.loc['2024-05-01':].copy()
In [26]:
#y = y.loc['2024-04-01':'2024-06-01'].copy()

Auto Create¶

In [27]:
# Forecast horizon (48 hourly steps), ranking metric, and models excluded
# from the comparison.
FH=48
metric = "mase"
exclude = ["auto_arima", "bats", "tbats", "lar_cds_dt", "par_cds_dt"]
In [ ]:
# Create and configure the pycaret time-series experiment on y, with a fixed
# seed for reproducibility and forward-fill imputation for target/exogenous.
exp_auto = TSForecastingExperiment()

# enforce_exogenous=False --> Use multivariate forecasting when model supports it, else use univariate forecasting
exp_auto.setup(
    data=y, target=target, fh=FH, enforce_exogenous=False,
    numeric_imputation_target="ffill", numeric_imputation_exogenous="ffill",session_id=42
)

Include slower models like Prophet (turbo=False), but exclude some specific models¶

In [ ]:
# Timing notes: 3 min 29 s starting from 15 May — 7 min 22 s starting from 1 May.
# Include slower models like Prophet (turbo=False), but exclude some specific models ----
best = exp_auto.compare_models(sort=metric, turbo=False, exclude=exclude)
In [ ]:
exp_auto.plot_model(best)

Finalize Model¶

In [31]:
final_auto_model = exp_auto.finalize_model(best)
In [ ]:
# Timing note: ~19 min starting from 15 May — (no figure recorded) from 1 May.

# Exogenous variables the finalized model needs in order to make future
# predictions. Fix: the original assigned this list to `exo_vars` but the
# loop below and the column labels at the end silently reused the earlier,
# manually-typed `exog_vars` list; if the two ever differed, future_exog
# would get wrong column names. Use the experiment's own list everywhere.
exog_vars = exp_auto.exogenous_variables
print(f"{len(exog_vars)} exogenous variables (X) needed in order to make future predictions:\n{exog_vars}")

# Step 1: fit one univariate model per exogenous variable ----
exog_exps = []
exog_models = []
for exog_var in exog_vars:
    exog_exp = TSForecastingExperiment()
    exog_exp.setup(
        data=y[exog_var], fh=FH,
        numeric_imputation_target="ffill", numeric_imputation_exogenous="ffill", session_id=42
    )

    # Users can customize how to model future exogenous variables i.e. add
    # more steps and models to potentially get better models at the expense
    # of higher modeling time.
    # `best_exog` avoids shadowing the notebook-level `best` from compare_models.
    best_exog = exog_exp.compare_models(
        sort=metric, include=["arima", "ets", "exp_smooth", "theta", "lightgbm_cds_dt"]
    )
    final_exog_model = exog_exp.finalize_model(best_exog)

    exog_exps.append(exog_exp)
    exog_models.append(final_exog_model)

# Step 2: Get future predictions for exog variables ----
future_exog = [
    exog_exp.predict_model(exog_model)
    for exog_exp, exog_model in zip(exog_exps, exog_models)
]
future_exog = pd.concat(future_exog, axis=1)
future_exog.columns = exog_vars

# Step 3: forecast the target using the predicted exogenous series ----
future_preds = exp_auto.predict_model(final_auto_model, X=future_exog)
In [ ]:
exp_auto.plot_model(final_auto_model, data_kwargs={'X': future_exog})
In [ ]:
 
In [ ]:
 

Save Model¶

In [ ]:
_ = exp_auto.save_model(final_auto_model, "my_blender")

Load Model¶

In [ ]:
loaded_exp = TSForecastingExperiment()
m = loaded_exp.load_model("my_blender")
# Predictions should be same as before the model was saved and loaded
loaded_exp.predict_model(m)
In [ ]: