Open In Colab

Aqui vou trabalhar com um problema de séries temporais com o objetivo de realizar futuramente a previsão do clima médio nos próximos meses.

Desafio Clima Kaggel

EDA Realizada Anteriormente

EDA

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
df = pd.read_csv("daily_serie_train.csv")
df.head()
date meantemp humidity wind_speed meanpressure
0 2013-01-01 10.000000 84.500000 0.000000 1015.666667
1 2013-01-02 7.400000 92.000000 2.980000 1017.800000
2 2013-01-03 7.166667 87.000000 4.633333 1018.666667
3 2013-01-04 8.666667 71.333333 1.233333 1017.166667
4 2013-01-05 6.000000 86.833333 3.700000 1016.500000
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          1462 non-null   object 
 1   meantemp      1462 non-null   float64
 2   humidity      1462 non-null   float64
 3   wind_speed    1462 non-null   float64
 4   meanpressure  1462 non-null   float64
dtypes: float64(4), object(1)
memory usage: 57.2+ KB
df.isna().sum()
date            0
meantemp        0
humidity        0
wind_speed      0
meanpressure    0
dtype: int64
df.date = pd.DatetimeIndex(df.date.values)
df = df.set_index('date')
df.head()
meantemp humidity wind_speed meanpressure
date
2013-01-01 10.000000 84.500000 0.000000 1015.666667
2013-01-02 7.400000 92.000000 2.980000 1017.800000
2013-01-03 7.166667 87.000000 4.633333 1018.666667
2013-01-04 8.666667 71.333333 1.233333 1017.166667
2013-01-05 6.000000 86.833333 3.700000 1016.500000
df.describe()
meantemp humidity wind_speed meanpressure
count 1462.000000 1462.000000 1462.000000 1462.000000
mean 25.495521 60.771702 6.802209 1011.104548
std 7.348103 16.769652 4.561602 180.231668
min 6.000000 13.428571 0.000000 -3.041667
25% 18.857143 50.375000 3.475000 1001.580357
50% 27.714286 62.625000 6.221667 1008.563492
75% 31.305804 72.218750 9.238235 1014.944901
max 38.714286 100.000000 42.220000 7679.333333
def remove_outliers(df, col):
  q1 = df[col].quantile(0.25)
  q3 = df[col].quantile(0.75)
  iqr = q3-q1
  df = df[(df[col]>(q1-1.5*iqr)) & (df[col]<(q3+1.5*iqr))]
  return df
for col in ["wind_speed","meanpressure"]:
  df=remove_outliers(df, col)
df[['meanpressure']].boxplot()
plt.show()
selected_features = ['humidity', 'wind_speed']
target = ['meantemp']
df = df[selected_features + target]
train = df[df.index<"2016-08-01"] 
test = df[df.index>="2016-08-01"]
train['month'] = train.index.month
train = train.join(train.groupby('month').agg({'meantemp':'mean'}).reset_index(), on='month', rsuffix='_month')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
test['month'] = test.index.month
test = test.join(train.groupby('month').agg({'meantemp':'mean'}).reset_index(), on='month', rsuffix='_month')
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
train.drop(columns=['month','month_month'], inplace=True)
test.drop(columns=['month','month_month'], inplace=True)
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
train = pd.DataFrame(data = min_max.fit_transform(train), columns=train.columns)
test = pd.DataFrame(data =min_max.transform(test), columns=test.columns)
train.head()
humidity wind_speed meantemp meantemp_month
0 0.840372 0.000000 0.122271 0.140025
1 0.929054 0.167181 0.042795 0.140025
2 0.869932 0.259935 0.035662 0.140025
3 0.684685 0.069191 0.081514 0.140025
4 0.867962 0.207574 0.000000 0.140025
for idx, column in enumerate(df.columns):
  fig, axs = plt.subplots(1,2)
  fig.set_size_inches(10,5)
  sns.boxplot(df[column], ax=axs[0])
  sns.histplot(df[column], ax=axs[1], kde=True)
  plt.show()
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  FutureWarning
df.resample("M").mean().head()
humidity wind_speed meantemp
date
2013-01-31 73.028802 4.833913 12.074770
2013-02-28 71.938563 7.474090 16.867560
2013-03-31 57.686706 8.246956 22.996905
2013-04-30 34.612103 8.046385 28.895119
2013-05-31 28.938249 8.943452 33.776767
df.resample("M").mean().meantemp.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x7fdeb696b390>
df.resample("H").ffill().head()
humidity wind_speed meantemp
date
2013-01-01 00:00:00 84.5 0.0 10.0
2013-01-01 01:00:00 84.5 0.0 10.0
2013-01-01 02:00:00 84.5 0.0 10.0
2013-01-01 03:00:00 84.5 0.0 10.0
2013-01-01 04:00:00 84.5 0.0 10.0
df.resample("H").bfill()
humidity wind_speed meantemp
date
2013-01-01 00:00:00 84.5 0.00 10.0
2013-01-01 01:00:00 92.0 2.98 7.4
2013-01-01 02:00:00 92.0 2.98 7.4
2013-01-01 03:00:00 92.0 2.98 7.4
2013-01-01 04:00:00 92.0 2.98 7.4
... ... ... ...
2016-12-31 20:00:00 100.0 0.00 10.0
2016-12-31 21:00:00 100.0 0.00 10.0
2016-12-31 22:00:00 100.0 0.00 10.0
2016-12-31 23:00:00 100.0 0.00 10.0
2017-01-01 00:00:00 100.0 0.00 10.0

35065 rows × 3 columns

df.plot(subplots=True, figsize=(15,20))
array([<matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb689d690>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb684bbd0>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x7fdeb6892c90>],
      dtype=object)
df.corr()['humidity_lag']=df.humidity.shift()
df.dropna(inplace=True)
corr=df.corr()
sns.heatmap(data=corr,annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7fdeb6a10750>
sns.pairplot(df)
plt.title('Dispersão de pares')
Text(0.5, 1.0, 'Dispersão de pares')
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df.meantemp, freq=365)
ax = result.plot()
ax.set_size_inches(15,20)
plt.show()
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
temp_log = np.log(df['meantemp'])
rolmean_log = temp_log.rolling(window=7).mean()
rolstd = temp_log.rolling(window=7).std()

fig, ax = plt.subplots(figsize=(15,10))

orig = plt.plot(temp_log, color = 'blue', label = 'Transformação Logarítmica')
mean = plt.plot(rolmean_log, color = 'red', label = 'Média Móvel de Transformação')
desvio = plt.plot(rolstd, color = 'black', label = 'Desvio Padrão Móvel')

plt.legend(loc='best')
plt.title('Estatísticas de rolagem - Log')
ax.xaxis_date()
fig.autofmt_xdate()
plt.show(block=False)
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf

print('Resultados: Dickey Fuller test: ')
adftest = adfuller(df['meantemp'])

out = pd.Series(adftest[0:4], index=['Teste', 'p-valor', 'Lags', 'Numero de observações usadas'])
for key,value in adftest[4].items():
  out['Valor Crítico (%s): ' % key] = value
print(out)
Resultados: Dickey Fuller test: 
Teste                             -2.031594
p-valor                            0.272894
Lags                              10.000000
Numero de observações usadas    1412.000000
Valor Crítico (1%):               -3.434990
Valor Crítico (5%):               -2.863589
Valor Crítico (10%):              -2.567861
dtype: float64
log_menos_media = temp_log - rolmean_log
log_menos_media.dropna(inplace=True)
rolmean_log = log_menos_media.rolling(window=7).mean()
rolstd = log_menos_media.rolling(window=7).std()

fig, ax = plt.subplots(figsize=(15,10))

orig = plt.plot(log_menos_media, color = 'blue', label = 'Transformação Logarítmica')
mean = plt.plot(rolmean_log, color = 'red', label = 'Média Móvel de Transformação')
desvio = plt.plot(rolstd, color = 'black', label = 'Desvio Padrão Móvel')

plt.legend(loc='best')
plt.title('Estatísticas de rolagem - Log')
ax.xaxis_date()
fig.autofmt_xdate()
plt.show(block=False)
print('Resultados: Dickey Fuller test: ')
adftest = adfuller(log_menos_media, autolag = 'AIC')

dfoutput = pd.Series(adftest[0:4], index=['Teste', 'p-valor', '#Lags', 'Numero de observações usadas'])
for key,value in adftest[4].items():
  dfoutput['Valor Crítico (%s): ' % key] = value
print(dfoutput)
Resultados: Dickey Fuller test: 
Teste                          -1.354640e+01
p-valor                         2.456760e-25
#Lags                           8.000000e+00
Numero de observações usadas    1.408000e+03
Valor Crítico (1%):            -3.435003e+00
Valor Crítico (5%):            -2.863595e+00
Valor Crítico (10%):           -2.567864e+00
dtype: float64
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plot_acf(df.meantemp)
plot_pacf(df.meantemp)
plt.show()

Conclusão

  • A umidade pode ser uma variável. pois possui correlação fraca com a média de temperatura