import numpy as np
import pandas as pd
import sqlite3Creating a Database
First we create a weather database and populate it with three tables - temperatures, stations, and countries.
conn = sqlite3.connect("weather.db")
# Since there is no weather database the above command will create itWe use pandas library to read csv files and store those in the weather database.
df = pd.read_csv("../../../temps_stacked.csv")
df.to_sql("temperatures", conn, index=False)13992662
df = pd.read_csv("../../../countries.csv")
df.to_sql("countries", conn, index=False)279
df = pd.read_csv("../../../station-metadata.csv")
df.to_sql("stations", conn, index=False)27585
#closing database connection
conn.close()Writing a Query Function
Next we create a function, query_climate_database(), that retrieves some of the data from the weather database created above. This function accepts four arguments, the name of the country, the earliest and latest years and the month for which data should be returned. This function returns a Pandas dataframe with information about the station name, stations’ latitude and longitude, the name of the country where the station is located, the year in which the recording was made, the month of the temperature recording, and the average temperature. Instead of returning the entire dataset it chooses all the entries that have their year attribute lying between begin_year and year_end and the month and country name are also specified by the function. We will use this funciton later in order to create interesting visualization about temperature change reported by each station in a country.
def query_climate_database(country, year_begin, year_end, month):
"""
This function returns a pandas dataframe of temperature readings for the
specified country, in the specified date range, in the specified month. It
has four parameters:
country: a string, the name of the country for which we should retrun data
year_begin: an integer, the earliest year of the data that should be returned
year_end: an integer, the lateset year of the data that should be returned
month: an integer, the month for which the data should be returned
"""
conn = sqlite3.connect("weather.db")
cmd = f"""
SELECT S.NAME, S.LATITUDE, S.LONGITUDE,
C.Name Country, T.Year, T.Month, T.Temp
FROM temperatures T
LEFT JOIN stations S ON T.ID=S.ID
LEFT JOIN countries C on SUBSTRING(S.ID,1,2)=C.[FIPS 10-4]
WHERE Country="{country}" AND
(T.Year BETWEEN {year_begin} AND {year_end})
AND (T.Month={month})
"""
df = pd.read_sql(cmd, conn)
conn.close()
return dfFor example, if we run the function for the country India and pass the month and begin and end year as specified above, we will get a pandas dataframe showing only the data related to India in that timeline.
query_climate_database(country = "India",
year_begin = 1980,
year_end = 2020,
month = 1)| NAME | LATITUDE | LONGITUDE | Country | Year | Month | Temp | |
|---|---|---|---|---|---|---|---|
| 0 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1980 | 1 | 23.48 |
| 1 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1981 | 1 | 24.57 |
| 2 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1982 | 1 | 24.19 |
| 3 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1983 | 1 | 23.51 |
| 4 | PBO_ANANTAPUR | 14.583 | 77.633 | India | 1984 | 1 | 24.81 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3147 | DARJEELING | 27.050 | 88.270 | India | 1983 | 1 | 5.10 |
| 3148 | DARJEELING | 27.050 | 88.270 | India | 1986 | 1 | 6.90 |
| 3149 | DARJEELING | 27.050 | 88.270 | India | 1994 | 1 | 8.10 |
| 3150 | DARJEELING | 27.050 | 88.270 | India | 1995 | 1 | 5.60 |
| 3151 | DARJEELING | 27.050 | 88.270 | India | 1997 | 1 | 5.70 |
3152 rows × 7 columns
Writing a Geographic Scatter Function for Yearly Temeperature Increases
Next we import an interactive graphic tool plotly. We will use this library to create temperature presentations on the real map. The user would be able to zoom in and explore the entire map. The other library is from sklearn that will be used to create a linear model and use it’s intercept to measure the average temperature change in a country for a specific month in a certain time period.
import plotly.express as px
# Interactive graphing library
from sklearn.linear_model import LinearRegressionNow we create a function called temperature_coefficient_plot(). This function returns an interactive geographic scatterplot where each dot on the map corresponds to a single station and by hovering on it the user can see its name. The color of the dot represents the amount of temperature increase reported by a station the value of which can be checked by the colorbar provided on the side of the plot.
def temperature_coefficient_plot(country, year_begin, year_end, month,
min_obs, **kwargs):
"""
The output of this function is an interactive geographic scatterplot,
constructed using Plotly Express, with a point for each station. The color
of the point reflects an estimate of the yearly change in temperature. There are
five explicit arguments and an undetermined number of keyword arguments:
country: a string, the name of the country for which we should retrun data
year_begin: an integer, the earliest year of the data that should be returned
year_end: an integer, the lateset year of the data that should be returned
month: an integer, the month for which the data should be returned
min_obs: an integer, the minimum number of years data for each station for the month
specified. If this criterion is not met, the row should be filtered out.
kwargs: these can be used to control the colormap used, or the mapbox style.
"""
df = query_climate_database(country, year_begin, year_end, month)
# we filter out the rows for which we have less data than the min_obs threshold
df["number_objects"] = df.groupby("NAME")['NAME'].transform('count')
df = df[df["number_objects"] >= min_obs]
# using linearRegression we find the average temeperature change reported by
# each station.
g = df.groupby("NAME")[["Year","Temp"]].apply(lambda x: np.round(LinearRegression().fit(
x['Year'].values.reshape(-1,1),x['Temp']).coef_[0],4))
# we change pandas series data type into dataframe, reset index, and rename the new column.
g = g.to_frame()
g = g.reset_index()
g = g.rename(columns={0:"Estimated Yearly Increase (C)"})
# we merge the two data frames.
result = pd.merge(df, g, how='left', on='NAME')
# months dictionary
months = {1: "January",
2: "February",
3: "March",
4: "April",
5: "May",
6: "June",
7: "July",
8: "August",
9: "September",
10: "October",
11: "November",
12: "December"}
# By using plotly.express, we make an interactive graph.
fig = px.scatter_mapbox(result,
lat='LATITUDE',
lon='LONGITUDE',
color="Estimated Yearly Increase (C)",
zoom=kwargs.pop('zoom',3),
height = 500,
color_continuous_midpoint=0,
hover_name="NAME",
mapbox_style=kwargs.pop('mapbox_style', "carto-positron"),
color_continuous_scale = kwargs.pop('color_continuous_scale', color_map),
title = f"Estimates of yearly increase in temperature in {months[month]} for staions in {str(country)}, years {str(year_begin)} - {str(year_end)}""")
return figWe run this function for India from 1980 to 2020 for the month of January. It only shows the stations that have at least 10 records in that time interval.
color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("India", 1980, 2020, 1,
min_obs = 10,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map)
fig.update_layout(margin={"l":0,"b":0})
fig.show()We run the temperature_coefficient_plot() for another country - China, keeping the other parameters constant.
color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("China", 1980, 2020, 1,
min_obs = 10,
zoom = 2,
mapbox_style="carto-positron",
color_continuous_scale=color_map)
fig.update_layout(margin={"r":0,"l":0,"b":0})
fig.show()By running the temperature_coefficient_plot() for my home country - Armenia, keeping the other parameters constant, I get some insight about the temeperature increases.
color_map = px.colors.diverging.RdGy_r # choose a colormap
fig = temperature_coefficient_plot("Armenia", 1980, 2020, 1,
min_obs = 10,
zoom = 5,
mapbox_style="carto-positron",
color_continuous_scale=color_map)
fig.update_layout(margin={"r":0,"l":0,"b":0})
fig.show()Creating Two More Interesting Figures
We create another function - max_min_temeperature_per_country_year(), that returns a pandas dataframe of maximum and minimum temperature readings for each station of a country. It has only one parameter country. It uses sql query language to retrieve data for a country and find some aggregates.
def max_min_temeperature_per_country_year(country):
"""
This function returns a pandas dataframe of maximum and minimum temperature readings for the
specified country in each station in each year. It has only one parameter:
country: a string, the name of the country
"""
conn = sqlite3.connect("weather.db")
cmd = f"""
SELECT S.NAME, S.LATITUDE, S.LONGITUDE,
C.Name Country, T.Year, MIN(T.Temp) [Minimum Temperature],
MAX(T.Temp) [Maximum Temperature]
FROM temperatures T
LEFT JOIN stations S ON T.ID=S.ID
LEFT JOIN countries C on SUBSTRING(S.ID,1,2)=C.[FIPS 10-4]
WHERE country="{country}"
GROUP BY S.NAME
"""
df = pd.read_sql(cmd, conn)
conn.close()
return dfRuning this function for India, we get the following figure.
max_min_temeperature_per_country_year("India")| NAME | LATITUDE | LONGITUDE | Country | Year | Minimum Temperature | Maximum Temperature | |
|---|---|---|---|---|---|---|---|
| 0 | AGARTALA | 23.8830 | 91.2500 | India | 1999 | 16.14 | 31.20 |
| 1 | AGRA | 27.1667 | 78.0333 | India | 1948 | 9.97 | 37.45 |
| 2 | AHMADABAD | 23.0670 | 72.6330 | India | 2010 | 18.28 | 36.30 |
| 3 | AJMERE | 26.4700 | 74.6200 | India | 1949 | 11.85 | 35.44 |
| 4 | AKOLA | 20.7000 | 77.0670 | India | 1998 | 18.89 | 37.75 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 111 | UDAIPUR_DABOK | 24.6170 | 73.8830 | India | 2010 | 15.13 | 35.72 |
| 112 | VARANASI_BABATPUR | 25.4500 | 82.8670 | India | 2012 | 12.78 | 36.25 |
| 113 | VERAVAL | 20.9000 | 70.3670 | India | 2017 | 19.60 | 31.15 |
| 114 | VISHAKHAPATNAM | 17.7170 | 83.2330 | India | 2001 | 21.31 | 33.50 |
| 115 | VIZAGAPATAM | 17.7000 | 83.3700 | India | 1945 | 21.50 | 31.60 |
116 rows × 7 columns
In this part, I will create a visualizaions that address the following question: How the temperature varies in a narrow latitude range? In order to solve this problem I create a function called reports_for_latitude(). It creates a histograms visualization of average temperatures lying on a certain range of latitdue.
def reports_for_latitude(latitude_min, latitude_max, year, month, **kwargs):
"""
This function creates a histogram visualization of average temperatures that lie on the region provided
by latitude_min and latitude_max. The visualization is created with respect to longitude.
This function has four explicit arguments and an undetermined
number of keyword arguments:
latitude_min: a float, this is the smallest latitude range
latitude_max: a float, this is the biggest latitdue range
year: an integer, this is the year for which we want to create a visualization
month: an integer, this is the month for which we want to create a visualization
kwargs: these can be used to control the colormap used, or the mapbox style.
"""
conn = sqlite3.connect("weather.db")
cmd = f"""
SELECT S.NAME, S.LATITUDE, S.LONGITUDE,
C.Name Country, T.Year, T.Month, T.Temp
FROM temperatures T
LEFT JOIN stations S ON T.ID=S.ID
LEFT JOIN countries C on SUBSTRING(S.ID,1,2)=C.[FIPS 10-4]
WHERE (S.LATITUDE BETWEEN {latitude_min} AND {latitude_max}) AND
(T.Year={year}) AND
(T.Month={month})
ORDER BY S.LONGITUDE
"""
df = pd.read_sql(cmd, conn)
conn.close()
fig = px.histogram(df,
x="LONGITUDE",
nbins = kwargs.pop('nbins', 24),
title = f"The number of weather reports in each Longitude interval {latitude_min}-{latitude_max}")
return figBy running this function, we create a histogram histogram represtion of the average temperature in each longitude that lies on that latitdue range.
fig = reports_for_latitude(35, 60, 2020, 7)
fig.show()In this part, I will create a visualizaions that address the following question: How the average temperature varies in each country from month to month for a few consecutive years? These plots show yearly flactuations in temperature for each month.
def month_to_month(country, year_begin, year_end):
"""
This function creates a line visualization of temperature change from month to month in the specified
country. This function has two explicit arguments:
country: a string, the name of the country
year_begin: an integer, this is the smallest year for which we want to create a visualization
year_end: an integer, this is the biggest year for which we want to create a visualization
"""
conn = sqlite3.connect("weather.db")
cmd = f"""
SELECT C.Name Country, T.Year year, T.Month month, AVG(T.Temp) temperature
FROM temperatures T
LEFT JOIN stations S ON T.ID=S.ID
LEFT JOIN countries C on SUBSTRING(S.ID,1,2)=C.[FIPS 10-4]
WHERE Country="{country}" AND
(year BETWEEN {year_begin} AND {year_end})
GROUP BY month, year
ORDER BY month
"""
df = pd.read_sql(cmd, conn)
conn.close()
fig = px.line(df,
x="month",
y="temperature",
facet_col="year",
title=f"Average temperature change by month in {country}"
)
return figmonth_to_month(“Armenia”, 1999, 2003) returns a figure that has five subplots each showing monthly temperature chnage in Armenia from 1999 to 2003 including.
fig = month_to_month("Armenia", 1999, 2003)
fig.show()