Aqui vou trabalhar com um problema de séries temporais, com o objetivo de, futuramente, prever a temperatura média dos próximos meses.

EDA Realizada Anteriormente

EDA

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

df = pd.read_csv("daily_serie_train.csv")

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          1462 non-null   object 
 1   meantemp      1462 non-null   float64
 2   humidity      1462 non-null   float64
 3   wind_speed    1462 non-null   float64
 4   meanpressure  1462 non-null   float64
dtypes: float64(4), object(1)
memory usage: 57.2+ KB

df.isna().sum()

date            0
meantemp        0
humidity        0
wind_speed      0
meanpressure    0
dtype: int64

# Parse the textual dates and promote them to the frame's index so that
# time-based slicing and resampling are available.
date_index = pd.DatetimeIndex(df["date"].values)
df["date"] = date_index
df = df.set_index("date")
df.head()

df.describe()

def remove_outliers(df, col):
    """Return the rows of ``df`` whose ``col`` value lies within Tukey's fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``df[col]`` and
    ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter. It is not modified.
        col: Label of the numeric column used to compute the fences.

    Returns:
        The filtered DataFrame. Rows whose ``col`` value is NaN are dropped
        as well, because NaN fails both comparisons.
    """
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    # The bounds are inclusive: with strict comparisons a column whose IQR is
    # zero (e.g. a constant column) would lose every row, since no value is
    # strictly between two identical fences.
    within_fences = (df[col] >= lower_fence) & (df[col] <= upper_fence)
    return df[within_fences]

# Filter the two columns one after the other: the meanpressure quartiles are
# therefore computed on the rows that already passed the wind_speed filter.
for col in ["wind_speed","meanpressure"]:
  df=remove_outliers(df, col)

# Box plot of meanpressure after filtering, to inspect what remains.
df[['meanpressure']].boxplot()
plt.show()

# Keep only the explanatory columns and the target; every other column
# (e.g. meanpressure) is discarded from ``df`` from this point on.
selected_features = ['humidity', 'wind_speed']
target = ['meantemp']
df = df[selected_features + target]

# Chronological hold-out: rows before 2016-08-01 are used for training, the
# remainder for testing. ``.copy()`` detaches both frames from ``df`` so the
# column assignments below do not raise pandas' SettingWithCopyWarning.
train = df[df.index < "2016-08-01"].copy()
test = df[df.index >= "2016-08-01"].copy()

# Feature: mean training temperature of each row's calendar month.
train['month'] = train.index.month
monthly_mean = train.groupby('month').agg({'meantemp': 'mean'}).reset_index()
# ``DataFrame.join(on=...)`` matches the caller's column against the *index*
# of the other frame. After ``reset_index()`` that index is 0..n-1, so each
# month would be paired with the following month's mean and the last month
# with NaN. Re-key the lookup table by month number so the rows line up.
monthly_mean.index = monthly_mean['month'].values
# ``rsuffix`` still yields the ``month_month`` / ``meantemp_month`` columns.
train = train.join(monthly_mean, on='month', rsuffix='_month')

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# Work on an independent copy so the column assignment below does not raise
# pandas' SettingWithCopyWarning.
test = test.copy()
test['month'] = test.index.month
# The monthly means come from the training data only.
monthly_mean_train = train.groupby('month').agg({'meantemp': 'mean'}).reset_index()
# ``join(on='month')`` looks the month up in the other frame's *index*; after
# ``reset_index()`` that index is 0..n-1, which would pair every month with
# the following month's mean. Re-key the table by month number instead.
monthly_mean_train.index = monthly_mean_train['month'].values
test = test.join(monthly_mean_train, on='month', rsuffix='_month')

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

# The month keys were only needed for the join; remove them from both sets.
helper_columns = ['month', 'month_month']
train = train.drop(columns=helper_columns)
test = test.drop(columns=helper_columns)

from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training rows only and then reused, unchanged,
# on the test rows.
min_max = MinMaxScaler()

# NOTE: rebuilding the frames from the scaled arrays keeps the column names
# but replaces the date index with a default integer index.
scaled_train = min_max.fit_transform(train)
train = pd.DataFrame(data=scaled_train, columns=train.columns)

scaled_test = min_max.transform(test)
test = pd.DataFrame(data=scaled_test, columns=test.columns)

train.head()

# One figure per column: box plot on the left, histogram with KDE on the right.
for column in df.columns:
    fig, axs = plt.subplots(1, 2)
    fig.set_size_inches(10, 5)
    # Pass the series as ``x=``: seaborn warns that positional use of this
    # argument becomes an error from version 0.12.
    sns.boxplot(x=df[column], ax=axs[0])
    sns.histplot(df[column], ax=axs[1], kde=True)
    plt.show()

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning

df.resample("M").mean().head()

df.resample("M").mean().meantemp.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x7fdeb696b390>

df.resample("H").ffill().head()

df.resample("H").bfill()

df.plot(subplots=True, figsize=(15,20))

array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb689d690>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb684bbd0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb6892c90>],
      dtype=object)

# Lag feature: the humidity of the previous row. The original statement
# assigned it to the temporary frame returned by ``df.corr()``, so the column
# was silently discarded; attach it to ``df`` itself. ``assign`` returns a new
# frame, which also avoids writing into a slice of the earlier ``df``.
df = df.assign(humidity_lag=df.humidity.shift())

# The shift leaves the first row without a lagged value; drop it.
df.dropna(inplace=True)

corr=df.corr()
sns.heatmap(data=corr,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x7fdeb6a10750>

pair_grid = sns.pairplot(df)
# ``plt.title`` would only label the current subplot of the grid; put the
# title on the figure instead, nudged upwards so it clears the top row.
pair_grid.fig.suptitle('Dispersão de pares', y=1.02)

Text(0.5, 1.0, 'Dispersão de pares')

from statsmodels.tsa.seasonal import seasonal_decompose
# Decompose the temperature series using a cycle length of 365 rows.
# NOTE(review): newer statsmodels releases appear to call this argument
# ``period`` instead of ``freq`` — confirm against the installed version.
result = seasonal_decompose(df['meantemp'], freq=365)
decomposition_fig = result.plot()
decomposition_fig.set_size_inches(15, 20)
plt.show()

/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

# Log-transform the temperature and compute rolling statistics over a
# 7-row window.
temp_log = np.log(df['meantemp'])

weekly_window = temp_log.rolling(window=7)
rolmean_log = weekly_window.mean()
rolstd = weekly_window.std()

fig, ax = plt.subplots(figsize=(15, 10))

# Draw the transformed series together with its rolling mean and rolling
# standard deviation on the same axes.
orig = ax.plot(temp_log, color='blue', label='Transformação Logarítmica')
mean = ax.plot(rolmean_log, color='red', label='Média Móvel de Transformação')
desvio = ax.plot(rolstd, color='black', label='Desvio Padrão Móvel')

ax.legend(loc='best')
ax.set_title('Estatísticas de rolagem - Log')
ax.xaxis_date()
fig.autofmt_xdate()
plt.show(block=False)

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf

# Augmented Dickey-Fuller test on the untransformed temperature series.
print('Resultados: Dickey Fuller test: ')
adftest = adfuller(df['meantemp'])

# The first four entries of the result are reported under the labels below;
# entry 4 is a mapping of significance level -> critical value.
stat_labels = ['Teste', 'p-valor', 'Lags', 'Numero de observações usadas']
out = pd.Series(adftest[0:4], index=stat_labels)
for level, critical_value in adftest[4].items():
    out[f'Valor Crítico ({level}): '] = critical_value
print(out)

Resultados: Dickey Fuller test: 
Teste                             -2.031594
p-valor                            0.272894
Lags                              10.000000
Numero de observações usadas    1412.000000
Valor Crítico (1%):               -3.434990
Valor Crítico (5%):               -2.863589
Valor Crítico (10%):              -2.567861
dtype: float64

# Detrend the log series by subtracting its rolling mean; the first rows of
# the rolling window have no mean and are dropped.
log_menos_media = temp_log - rolmean_log
log_menos_media.dropna(inplace=True)

# NOTE: ``rolmean_log`` and ``rolstd`` are rebound here to the statistics of
# the detrended series, so re-running this cell alone changes its input.
rolmean_log = log_menos_media.rolling(window=7).mean()
rolstd = log_menos_media.rolling(window=7).std()

fig, ax = plt.subplots(figsize=(15,10))

# The series label and the title previously repeated the text of the plain
# log plot; they now name the detrended series that is actually drawn.
orig = plt.plot(log_menos_media, color = 'blue', label = 'Log menos Média Móvel')
mean = plt.plot(rolmean_log, color = 'red', label = 'Média Móvel de Transformação')
desvio = plt.plot(rolstd, color = 'black', label = 'Desvio Padrão Móvel')

plt.legend(loc='best')
plt.title('Estatísticas de rolagem - Log menos Média Móvel')
ax.xaxis_date()
fig.autofmt_xdate()
plt.show(block=False)

# Augmented Dickey-Fuller test on the detrended log series, with the lag
# order chosen by AIC.
print('Resultados: Dickey Fuller test: ')
adftest = adfuller(log_menos_media, autolag='AIC')

# The first four entries of the result are reported under the labels below;
# entry 4 is a mapping of significance level -> critical value.
stat_labels = ['Teste', 'p-valor', '#Lags', 'Numero de observações usadas']
dfoutput = pd.Series(adftest[0:4], index=stat_labels)
for level, critical_value in adftest[4].items():
    dfoutput[f'Valor Crítico ({level}): '] = critical_value
print(dfoutput)

Resultados: Dickey Fuller test: 
Teste                          -1.354640e+01
p-valor                         2.456760e-25
#Lags                           8.000000e+00
Numero de observações usadas    1.408000e+03
Valor Crítico (1%):            -3.435003e+00
Valor Crítico (5%):            -2.863595e+00
Valor Crítico (10%):           -2.567864e+00
dtype: float64

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plot_acf(df.meantemp)
plot_pacf(df.meantemp)
plt.show()

Conclusão

A umidade pode ser usada como variável explicativa, pois possui correlação (ainda que fraca) com a temperatura média.

	date	meantemp	humidity	wind_speed	meanpressure
0	2013-01-01	10.000000	84.500000	0.000000	1015.666667
1	2013-01-02	7.400000	92.000000	2.980000	1017.800000
2	2013-01-03	7.166667	87.000000	4.633333	1018.666667
3	2013-01-04	8.666667	71.333333	1.233333	1017.166667
4	2013-01-05	6.000000	86.833333	3.700000	1016.500000

	meantemp	humidity	wind_speed	meanpressure
date
2013-01-01	10.000000	84.500000	0.000000	1015.666667
2013-01-02	7.400000	92.000000	2.980000	1017.800000
2013-01-03	7.166667	87.000000	4.633333	1018.666667
2013-01-04	8.666667	71.333333	1.233333	1017.166667
2013-01-05	6.000000	86.833333	3.700000	1016.500000

	meantemp	humidity	wind_speed	meanpressure
count	1462.000000	1462.000000	1462.000000	1462.000000
mean	25.495521	60.771702	6.802209	1011.104548
std	7.348103	16.769652	4.561602	180.231668
min	6.000000	13.428571	0.000000	-3.041667
25%	18.857143	50.375000	3.475000	1001.580357
50%	27.714286	62.625000	6.221667	1008.563492
75%	31.305804	72.218750	9.238235	1014.944901
max	38.714286	100.000000	42.220000	7679.333333

	humidity	wind_speed	meantemp	meantemp_month
0	0.840372	0.000000	0.122271	0.140025
1	0.929054	0.167181	0.042795	0.140025
2	0.869932	0.259935	0.035662	0.140025
3	0.684685	0.069191	0.081514	0.140025
4	0.867962	0.207574	0.000000	0.140025

	humidity	wind_speed	meantemp
date
2013-01-31	73.028802	4.833913	12.074770
2013-02-28	71.938563	7.474090	16.867560
2013-03-31	57.686706	8.246956	22.996905
2013-04-30	34.612103	8.046385	28.895119
2013-05-31	28.938249	8.943452	33.776767

	humidity	wind_speed	meantemp
date
2013-01-01 00:00:00	84.5	0.0	10.0
2013-01-01 01:00:00	84.5	0.0	10.0
2013-01-01 02:00:00	84.5	0.0	10.0
2013-01-01 03:00:00	84.5	0.0	10.0
2013-01-01 04:00:00	84.5	0.0	10.0

	humidity	wind_speed	meantemp
date
2013-01-01 00:00:00	84.5	0.00	10.0
2013-01-01 01:00:00	92.0	2.98	7.4
2013-01-01 02:00:00	92.0	2.98	7.4
2013-01-01 03:00:00	92.0	2.98	7.4
2013-01-01 04:00:00	92.0	2.98	7.4
...	...	...	...
2016-12-31 20:00:00	100.0	0.00	10.0
2016-12-31 21:00:00	100.0	0.00	10.0
2016-12-31 22:00:00	100.0	0.00	10.0
2016-12-31 23:00:00	100.0	0.00	10.0
2017-01-01 00:00:00	100.0	0.00	10.0