import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns', 100)

from pyspark.sql.functions import col, asc

Starting Spark application

SparkSession available as 'spark'.


# Read the weather CSVs from every S3 year folder matching '20*'
# (years 2000-2016 per the original note). header=True takes column names
# from each file's first row; inferSchema=True derives the column types.
weather = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('s3://aws-gsod/20*/*.csv')
)


from pyspark.sql.functions import col, asc
import gc

# Columns kept for the PH subset: station/date keys, the daily measurements
# and the weather-event indicator flags.
columns = [
    'ID', 'Year', 'Month', 'Day',
    'Mean_Temp', 'Mean_Dewpoint',
    'Mean_Sea_Level_Pressure',
    'Mean_Station_Pressure',
    'Mean_Visibility', 'Mean_Windspeed',
    'Max_Windspeed', 'Max_Gust',
    'Max_Temp', 'Min_Temp', 'Precipitation',
    'Rain_or_Drizzle', 'Fog', 'Thunder', 'Tornado',
]

# Keep only rows whose Country_Code is 'RP' (the PH filter), project the
# columns above, replace nulls with 0 and mark the result to be persisted.
# NOTE(review): zero-filling means missing readings are indistinguishable from
# real zeros; the 0.0 minima/medians of the pressure columns in the summary
# statistics further down look like a consequence of this -- confirm it is
# intended before modelling.
df_ph = (
    weather
    .filter(col("Country_Code") == 'RP')
    .select(*columns)
    .na.fill(0)
    .persist()
)

# Drop the notebook's reference to the full dataset and run a GC pass
del weather
gc.collect()

# Report how many PH observations were kept
print(f'Total Weather Observations (PH): {df_ph.count():,}')

Total Weather Observations (PH): 310,983


# Convert the PH subset to a pandas DataFrame; the seaborn plots and the
# describe()/corr() summaries below operate on this pandas copy.
df = df_ph.toPandas()
# Preview the first rows of the converted data
df.head()

             ID  Year  Month  Day  Mean_Temp  Mean_Dewpoint  \
0  986460-99999  2014      1    1       79.3           75.2   
1  986460-99999  2014      1    2       79.9           75.3   
2  986460-99999  2014      1    3       78.8           74.6   
3  986460-99999  2014      1    4       81.2           74.8   
4  986460-99999  2014      1    5       79.8           74.7   

   Mean_Sea_Level_Pressure  Mean_Station_Pressure  Mean_Visibility  \
0                   1011.7                 1008.8              5.6   
1                   1011.7                 1008.9              6.1   
2                   1011.6                 1008.8              6.0   
3                   1009.6                 1007.2              6.2   
4                   1010.0                 1007.3              6.1   

   Mean_Windspeed  Max_Windspeed  Max_Gust  Max_Temp  Min_Temp  Precipitation  \
0            10.0           15.9       0.0      87.1      75.2           1.46   
1             8.5           15.0       0.0      86.4      74.8           0.00   
2             5.5            9.9       0.0      82.4      75.2           0.00   
3             6.7           14.0       0.0      86.7      77.0           0.20   
4             6.9           15.9       0.0      86.0      73.4           0.00   

   Rain_or_Drizzle  Fog  Thunder  Tornado  
0                1    0        0        0  
1                1    0        0        0  
2                1    0        0        0  
3                0    0        0        0  
4                1    0        0        0


# Register df_ph as the temporary view 'weatherph' so it can be queried
# through spark.sql
df_ph.createOrReplaceTempView('weatherph')


# Total Yearly Rain Occurences
plt.subplots(figsize=(12,4))
sns.countplot(x="Year", hue="Rain_or_Drizzle", data=df)
plt.xlabel("Year")

plt.ylabel("Frequency of Rain")
plt.title("Total Yearly Rain Occurence in the Philippines");
%matplot plt


# Count the distinct station IDs in the 'weatherph' view, i.e. the number of
# unique weather stations present in the PH subset
spark.sql("""SELECT count(distinct ID) as total_weatherstations
            from weatherph""").show()

+---------------------+
|total_weatherstations|
+---------------------+
|                   72|
+---------------------+


# top 10 weather stations in PH based on rain frequency
station_yearly = (df_ph.groupby(['ID', 'Year']).sum('Rain_or_Drizzle')
                  .groupby(['ID']).mean('sum(Rain_or_Drizzle)')
                  .select(col("ID")
                      ,col("avg(sum(Rain_or_Drizzle))").alias("Average")) 
                  .orderBy('Average', ascending=False).toPandas())
plt.subplots(figsize=(12,4))
sns.barplot(x="Average", y="ID", data=station_yearly.iloc[:10,:])
# plt.xlabel("Year")
# plt.ylabel("Frequency of Rain")
plt.title("Top 10 Stations Based on Average Yearly Rain Occurence");
%matplot plt


import pyspark.sql.functions as F 
#Get auth summary
proportion = df_ph.groupby('Rain_or_Drizzle').agg(F.count('Id')).toPandas()
plt.subplots(figsize=(8,5))
sns.barplot(x="Rain_or_Drizzle", y="count(Id)", data=proportion)
plt.xlabel("Rain Indicator")
plt.ylabel("Count")
plt.title("Distribution of Rain Occurence in the Philippines");
%matplot plt


# Summary statistics of the pandas copy, transposed so that each feature is
# a row and each statistic a column
df.describe().T

                            count         mean         std     min     25%  \
Year                     310983.0  2008.044446    4.697303  2000.0  2004.0   
Month                    310983.0     6.413035    3.430157     1.0     3.0   
Day                      310983.0    15.682131    8.793600     1.0     8.0   
Mean_Temp                310983.0    80.852167    4.053744    25.3    79.3   
Mean_Dewpoint            310983.0    73.685832   10.144951   -18.5    73.6   
Mean_Sea_Level_Pressure  310983.0   985.280006  155.886064     0.0  1008.2   
Mean_Station_Pressure    310983.0   280.047105  444.767931     0.0     0.0   
Mean_Visibility          310983.0     9.886989    3.189833     0.0     7.1   
Mean_Windspeed           310983.0     4.157829    2.873096     0.0     2.2   
Max_Windspeed            310983.0     7.718089    4.508868     0.0     3.9   
Max_Gust                 310983.0     0.424989    3.187404     0.0     0.0   
Max_Temp                 310983.0    87.897557    4.942869     0.0    85.6   
Min_Temp                 310983.0    74.256564    4.384074     0.0    73.0   
Precipitation            310983.0     0.268629    0.749445     0.0     0.0   
Rain_or_Drizzle          310983.0     0.377667    0.484804     0.0     0.0   
Fog                      310983.0     0.033140    0.179003     0.0     0.0   
Thunder                  310983.0     0.126711    0.332650     0.0     0.0   
Tornado                  310983.0     0.000244    0.015631     0.0     0.0   

                            50%     75%     max  
Year                     2008.0  2012.0  2016.0  
Month                       6.0     9.0    12.0  
Day                        16.0    23.0    31.0  
Mean_Temp                  81.5    83.3   107.1  
Mean_Dewpoint              75.8    77.2    87.1  
Mean_Sea_Level_Pressure  1009.7  1011.3  1042.2  
Mean_Station_Pressure       0.0   940.8  1023.4  
Mean_Visibility             9.8    12.4    25.3  
Mean_Windspeed              3.6     5.5    40.6  
Max_Windspeed               7.8     9.7    73.8  
Max_Gust                    0.0     0.0    76.9  
Max_Temp                   88.7    91.0   117.5  
Min_Temp                   75.2    77.0    96.8  
Precipitation               0.0     0.2    19.5  
Rain_or_Drizzle             0.0     1.0     1.0  
Fog                         0.0     0.0     1.0  
Thunder                     0.0     0.0     1.0  
Tornado                     0.0     0.0     1.0


# Get correlations of numerical features
plt.subplots(figsize=(20,12))
sns.heatmap(df.corr())
plt.xticks(rotation=20)
%matplot plt


# Drop the station ID and the features that are highly correlated with
# others (max/min temperature and max windspeed).
# The list is passed to drop() so the column names are written only once.
# NOTE(review): model_df is not referenced by the later cells visible in this
# notebook -- the assembler is applied to df_ph directly.
columns = ['ID', 'Max_Windspeed', 'Max_Temp', 'Min_Temp']
model_df = df_ph.drop(*columns)


from pyspark.ml.feature import VectorAssembler

# Assemble the numerical predictors into the single vector column
# 'NumFeatures'. handleInvalid='skip' is passed through to VectorAssembler.
# The list leaves out ID, Max_Windspeed, Max_Temp and Min_Temp as well as the
# target 'Rain_or_Drizzle'.
assembler = VectorAssembler(
    inputCols=[
        'Year', 'Month', 'Day',
        'Mean_Temp', 'Mean_Dewpoint',
        'Mean_Sea_Level_Pressure',
        'Mean_Station_Pressure',
        'Mean_Visibility', 'Mean_Windspeed',
        'Max_Gust',
        'Precipitation',
        'Fog', 'Thunder', 'Tornado',
    ],
    outputCol='NumFeatures',
    handleInvalid='skip',
)

# Transform the persisted PH data (df_ph itself, so the non-feature columns
# stay in `data`) and expose 'Rain_or_Drizzle' as the model's 'label' column
data = (
    assembler
    .transform(df_ph)
    .withColumnRenamed('Rain_or_Drizzle', 'label')
)

# 80/20 train/test split with a fixed seed
train, test = data.randomSplit([0.8, 0.2], seed=42)


from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression,GBTClassifier,LinearSVC, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def fit_model(model, paramGrid=None):
    """Fit ``model`` behind the shared scaler and predict on the test split.

    The function reads three module-level names: ``standardscaler`` (used as
    the first pipeline stage), ``train`` (data the pipeline is fitted on) and
    ``test`` (data the fitted pipeline is applied to). They must be defined
    before the function is called.

    Args:
        model: Estimator placed after the scaler as the last pipeline stage.
        paramGrid: Optional parameter maps. When given, the pipeline is
            wrapped in a 3-fold ``CrossValidator`` scored by a
            default-configured ``MulticlassClassificationEvaluator``; when
            ``None`` the pipeline is fitted directly.

    Returns:
        Tuple ``(fitmodel, results)``: the fitted pipeline (or fitted
        cross-validator) and the output of its ``transform`` on ``test``.
    """
    pipeline = Pipeline(stages=[standardscaler, model])

    # Identity check: None is the only "no grid" marker, so an explicitly
    # passed (even empty) grid still takes the cross-validation branch.
    if paramGrid is not None:
        crossval = CrossValidator(estimator=pipeline,
                                  estimatorParamMaps=paramGrid,
                                  evaluator=MulticlassClassificationEvaluator(),
                                  numFolds=3)
        fitmodel = crossval.fit(train)
    else:
        fitmodel = pipeline.fit(train)

    results = fitmodel.transform(test)

    return fitmodel, results

from pyspark.sql.types import IntegerType, DoubleType

# Evaluate the model on test set
def val_evaluation(model, results):
    """Score a prediction DataFrame and return one summary row.

    Args:
        model: Value placed first in the returned tuple; the callers in this
            notebook pass a display name such as "RandomForest".
        results: DataFrame holding 'prediction' and 'label' columns.

    Returns:
        Tuple ``(model, f1, accuracy, precision, recall)`` with the four
        scores rounded to two decimals. ``f1`` is the value produced by a
        default-configured ``MulticlassClassificationEvaluator``; the other
        three are derived from the 2x2 corner of the confusion matrix.
    """
    # The label is cast to double before the (prediction, label) rows are
    # handed to MulticlassMetrics as an RDD.
    label_as_double = col('label').cast(DoubleType())
    pairs = (
        results
        .select(['prediction', 'label'])
        .withColumn('label', label_as_double)
        .rdd
    )

    cm = MulticlassMetrics(pairs).confusionMatrix().toArray()

    # NOTE(review): the formulas assume rows are actual labels and columns are
    # predicted labels, with class 1 as the positive class -- confirm against
    # the MulticlassMetrics documentation.
    positives_hit = cm[1][1]
    accuracy = (cm[0][0] + positives_hit) / cm.sum()
    precision = positives_hit / (cm[0][1] + positives_hit)
    recall = positives_hit / (cm[1][0] + positives_hit)
    f1 = MulticlassClassificationEvaluator().evaluate(results)

    rounded = tuple(round(score, 2) for score in (f1, accuracy, precision, recall))
    return (model,) + rounded


# Standard Scaler: reads 'NumFeatures' and writes 'features', with both mean
# centering and std scaling enabled. fit_model looks this name up at call time
# and uses it as the first stage of every pipeline.
standardscaler = StandardScaler(inputCol="NumFeatures", outputCol="features", withMean=True, withStd=True)

# Candidate classifiers to compare (the random forest uses its defaults)
lr = LogisticRegression(maxIter=10, regParam=0.0)
gbt = GBTClassifier(maxIter=5, maxDepth=2)
lsvc = LinearSVC(maxIter=10, regParam=0.1)
rf = RandomForestClassifier()

# Fit each model without a parameter grid (plain pipeline fit, no
# cross-validation); each call returns the fitted pipeline and its test predictions
lrmodel, lrresults = fit_model(lr)
rfmodel, rfresults = fit_model(rf)
lsvcmodel, lsvcresults = fit_model(lsvc)
gbtmodel, gbtresults = fit_model(gbt)


# Score every fitted model on its test predictions and tabulate the results,
# one row per model in the order listed here
scores = [
    val_evaluation(name, predictions)
    for name, predictions in (
        ("LogisticRegression", lrresults),
        ("RandomForest", rfresults),
        ("LSVC", lsvcresults),
        ("GBT", gbtresults),
    )
]
cols = ['model', 'f1', 'accuracy', 'precision', 'recall']
pd.DataFrame(scores, columns=cols)

                model    f1  accuracy  precision  recall
0  LogisticRegression  0.71      0.73       0.71    0.46
1        RandomForest  0.76      0.77       0.71    0.64
2                LSVC  0.65      0.69       0.69    0.32
3                 GBT  0.75      0.76       0.71    0.59


# Extract feature names from the original data
dict_feats = data.schema['NumFeatures'].metadata['ml_attr']['attrs']['numeric']
list_feats = np.array([x['name'] for x in dict_feats])

# Extract feature importance from rfmodel
featImportances = np.array(rfmodel.stages[-1].featureImportances)
columns = ['Year', 'Month', 'Day', 
       'Mean_Temp', 'Mean_Dewpoint', 
       'Mean_Sea_Level_Pressure',
       'Mean_Station_Pressure',
       'Mean_Visibility', 'Mean_Windspeed',
       'Max_Gust',
       'Precipitation',
       'Fog', 'Thunder','Tornado']

features = pd.DataFrame(featImportances, list_feats).reset_index()
plt.subplots(figsize=(20,14))
sns.barplot(x='index', y=0, data=features)
plt.xticks(rotation=45)
plt.title('Random Forest Feature Importances')
%matplot plt

Data Field	Description
ID	Unique ID of the weather station
Country_Code	Code of the country in which the weather station is located
Latitude	Latitude value of the station location
Longitude	Longitude value of the station location
Year	Year the observation was taken
Month	Month the observation was taken
Day	Day the observation was taken
Mean_Temp	Mean temperature for the day in degrees Fahrenheit to tenths.
Mean_Dewpoint	Mean dew point for the day in degrees Fahrenheit to tenths.
Mean_Visibility	Mean visibility for the day in miles to tenths.
Mean_Windspeed	Mean wind speed for the day in knots to tenths.
Max_Windspeed	Maximum sustained wind speed reported for the day in knots to tenths.
Max_Temp	Maximum temperature reported during the day in Fahrenheit to tenths.
Min_Temp	Minimum temperature reported during the day in Fahrenheit to tenths.
Fog	Indicator (1 = yes, 0 = no/not reported) for the occurrence of fog during the day
Rain_or_Drizzle	Indicator (1 = yes, 0 = no/not reported) for the occurrence of rain or drizzle during the day
Snow_or_Ice	Indicator (1 = yes, 0 = no/not reported) for the occurrence of snow or ice during the day
Hail	Indicator (1 = yes, 0 = no/not reported) for the occurrence of hail during the day
Thunder	Indicator (1 = yes, 0 = no/not reported) for the occurrence of thunder during the day
Tornado	Indicator (1 = yes, 0 = no/not reported) for the occurrence of a tornado during the day

Umaaraw, Umuulan: Predicting Rainfall Occurrences in the Philippines Using Apache Spark ¶

1 Data Description¶

2 Data Extraction¶

3 Data Preprocessing¶

Table 2. Sample Data¶

4 Exploratory Data Analysis¶

Table 3. Total Weather Stations in the Philippines¶

Table 4. Data Statistics¶

1 Baseline Accuracy¶

2 Modeling¶

3 Summary of Results¶

Table 5. Summary of ML Model Scores¶