Programming project¶

Preamble¶

  • Name: Luca Paus
  • Matrikelnummer: 12241609

01 - An energy balance model with hysteresis¶

Based on the code we wrote in week 07, code the following extension. For this exercise it's OK to copy-paste my solutions and start from there. Only copy the necessary (no useless code)!

The planetary albedo $\alpha$ is in fact changing with climate change. As the temperature drops, sea-ice and ice sheets are extending (increasing the albedo). Inversely, the albedo decreases as temperature rises. The planetary albedo of our simple energy balance model follows the following equation:

$$ \alpha = \begin{cases} 0.3,& \text{if } T \gt 280\\ 0.7,& \text{if } T \lt 250\\ a T + b, & \text{otherwise} \end{cases} $$

01-01: Compute the parameters $a$ and $b$ so that the equation is continuous at T=250K and T=280K.

Answer: from the requirements above we can deduce that $280a + b = 0.3$ and $250a + b = 0.7$. We can solve this as a system of equations:

1
2
3
4
import doctest
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
1
2
3
4
v1= np.array([[280,1],[250,1]])
v2= np.array([[0.3],[0.7]])
a, b = np.linalg.solve(v1, v2)    # solving the matrix equation
print(f"The parameter a is: {a}\nThe parameter b is: {b}")
The parameter a is: [-0.01333333]
The parameter b is: [4.03333333]

for $\alpha$ to be continous, the two parameters $a$ and $b$ equal: $a$ = -0.013333333333333338 and $b$ = 4.033333333333335

01-02: Now write a function called alpha_from_temperature which accepts a single positional parameter T as input (a scalar) and returns the corresponding albedo. Test your function using doctests to make sure that it complies to the instructions above.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
def alpha_from_temperature(T):
    """
    Function computes albedo at the Tmeperature T
    T= Temperature
    
    >>> result= alpha_from_temperature(290)
    >>> np.isclose(result,0.3)
    True
    >>> result= alpha_from_temperature(240)
    >>> np.isclose(result,0.7)
    True
    >>> result= alpha_from_temperature(280)
    >>> np.isclose(result,0.3)
    True
    >>> result= alpha_from_temperature(270)
    >>> np.isclose(result,0.433333)
    True
    """
    if T<250:
        alpha= 0.7
    elif T> 280:
        alpha = 0.3
    else:
        alpha= float(a)* T + float(b)
    return alpha
doctest.testmod()
TestResults(failed=0, attempted=8)

01-03: Adapt the existing code from week 07 to write a function called temperature_change_with_hysteresis which accepts t0 (the starting temperature in K), n_years (the number of simulation years) as positional arguments and tau (the atmosphere transmissivity) as keyword argument (default value 0.611). Verify that:

  • the stabilization temperature with t0 = 292 and default tau is approximately 288K
  • the stabilization temperature with t0 = 265 and default tau is approximately 233K
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def asr(alpha=0.3):
    """Absorbed Solar Radiation (W m-2).

    Parameters
    ----------
    alpha : float, optional
        the planetary albedo

    Returns
    -------
    the asr (float)

    Examples
    --------
    >>> print(f'{asr():.2f}')
    238.35
    >>> print(f'{asr(alpha=0.32):.2f}')
    231.54
    """
    s0 = 1362
    return (1 - alpha) * s0 / 4

def olr(t, tau=0.611):
    """Outgoing Longwave Radiation (W m-2).

    Parameters
    ----------
    t : float
        the atmosphere temperature (K)
    tau : float, optional
        the atmosphere transmissivity (-)

    Returns
    -------
    the olr (float)

    Examples
    --------
    >>> print(f'{olr(288):.2f}')
    238.34
    >>> print(f'{olr(288, tau=0.63):.2f}')
    245.75
    """
    sigma = 5.67E-8
    return sigma * tau * t**4

def temperature_change_with_hysteresis(t0, n_years, tau=0.611):    
    C = 4.0e+08
    t_0 = 288
    dt = 60 * 60 * 24 * 365
    
    years = np.arange(n_years + 1)
    temperature = np.zeros(n_years + 1)
    temperature[0] = t0
    
    for i in range(n_years):
        temperature[i + 1] = temperature[i] + dt / C * (asr(alpha=alpha_from_temperature(temperature[i])) - olr(temperature[i], tau=tau))
    return years, temperature

# Testing
doctest.testmod()
TestResults(failed=0, attempted=12)

def asr(alpha=0.3): s = 1362 return (1 - alpha) * s / 4

def olr(t, tau=0.611): sigma = 5.67 * 10 ** (-8) return tau * sigma * t ** 4

def temperature_change_with_hysteresis(t0, n_years, tau=0.611): """Temperature change scenario after change of transmissivity.

Parameters
----------
t0 : float
    the starting temperature (K)
n_years : int
    the number of simulation years
tau : float, optional
    the atmosphere transmissivity (-)

Returns
-------
(time, temperature) : ndarrays of size n_years + 1

Examples
--------
>>> y_292, t_292 = temperature_change_with_hysteresis(292, 20)
>>> y_292
array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20.])
>>> from numpy.testing import assert_allclose
>>> assert_allclose(t_292[20], [288.01264452], atol=15e-3)
>>> y_265, t_265 = temperature_change_with_hysteresis(265, 100)
>>> y_265
array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,  10.,
        11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.,  21.,
        22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,  30.,  31.,  32.,
        33.,  34.,  35.,  36.,  37.,  38.,  39.,  40.,  41.,  42.,  43.,
        44.,  45.,  46.,  47.,  48.,  49.,  50.,  51.,  52.,  53.,  54.,
        55.,  56.,  57.,  58.,  59.,  60.,  61.,  62.,  63.,  64.,  65.,
        66.,  67.,  68.,  69.,  70.,  71.,  72.,  73.,  74.,  75.,  76.,
        77.,  78.,  79.,  80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,
        88.,  89.,  90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,
        99., 100.])
>>> from numpy.testing import assert_allclose
>>> assert_allclose(t_265[100], [233.0258187], atol=26e-3)
"""
time = np.linspace(0, n_years, n_years + 1)
temperature = np.zeros(n_years + 1)
temperature[0] = t0
dt = 60 * 60 * 24 * 365
C = 4.0e+08
for i in range(n_years):
    temperature[i+1] = temperature[i] + dt / C * (asr(alpha_from_tempreature(temperature[i]))-olr(temperature[i],tau))
return time, temperature

doctest.testmod() #note that I had to rearrange lots of whitespace characters in the docstring, for a different operating system, they might have to be arranged differently

01-04: Realize a total of N simulations with starting temperatures regularly spaced between t0=206K, and t0=318K and plot them on a single plot for n_years=50. The plot should look somewhat similar to this example for N=21.

1
2
3
4
5
6
7
8
9
N = 21 #arbitrary number
t0 = np.linspace(206,318,N)
fig, ax = plt.subplots()
for i in range(N):
    time, temperature = temperature_change_with_hysteresis(t0[i], 50, tau=0.611)
    ax.plot(time, temperature,color='black')
ax.set_title('Climate change scenario with hysteresis')
ax.set_xlabel('Years from start')
ax.set_ylabel('Temperature (K)')  ;      
No description has been provided for this image

Bonus: only if you want (and if time permits), you can try to increase N and add colors to your plot to create a graph similar to this one.

1
2
3
4
5
6
7
8
9
N = 150 #arbitrary number
t0 = np.linspace(206,318,N)
fig, ax = plt.subplots()
for i in range(N):
    time, temperature = temperature_change_with_hysteresis(t0[i], 50, tau=0.611)
    ax.plot(time, temperature,color=((i/N),(N-np.absolute(N/2-i))/N,(N-i)/N))
ax.set_title('Climate change scenario with hysteresis')
ax.set_xlabel('Years from start')
ax.set_ylabel('Temperature (K)');    
No description has been provided for this image

02 - Weather station data files¶

I downloaded 10 min data from the recently launched ZAMG data hub. The data file contains selected parameters from the "INNSBRUCK-FLUGPLATZ (ID: 11804)" weather station.

You can download the data from the following links (right-click + "Save as..."):

  • station data
  • parameter metadata
  • station list from the ZAMG (in a better format than last time)

Let me open the data for you and display its content:

1
2
df = pd.read_csv('INNSBRUCK-FLUGPLATZ_Datensatz_20150101_20211231.csv', index_col=1, parse_dates=True)
df = df.drop('station', axis=1)

02-01: after reading the documentation of the respective functions (and maybe try a few things yourself), explain in plain sentences:

  • what am I asking pandas to do with the index_col=1, parse_dates=True keyword arguments? Why am I doing this?
  • what am I asking pandas to do with .drop()? Why axis=1?

What does the index_col=1 keyword argument do when passed to pandas? Why is it used?

The index_col=1 keyword argument in pandas is used to specify a specific column in the dataset to be used as the indexing column for the DataFrame (df). Instead of reading the selected column as a regular column of data in df, it is used to index the DataFrame. In this particular example, the value 1 corresponds to the column labeled 'time'. By setting index_col=1, the DataFrame df is indexed using the values contained in the 'time' column, which makes the DataFrame appear chronological to the programmer.

What does the parse_dates=True keyword argument do when passed to pandas?

The parse_dates=True keyword argument in pandas is used to indicate that the index should be "parsed." By default, the value of parse_dates is False. However, when set to True, pandas will attempt to convert the index into a datetime-like object. In this example, by examining the type of the index before and after using this keyword argument, one can observe that the type changes from 'pandas.core.indexes.range.RangeIndex' to 'pandas.core.indexes.range.DatetimeIndex'. Although the printed output of df may look the same, the type of the index has been changed. This conversion allows for more efficient handling and manipulation of dates in subsequent operations.

What does the .drop() function do when called on df? Why is axis=1 used?

The .drop() function is used to remove specified columns or rows from a DataFrame (df). In this case, the argument passed to .drop() is 'station', indicating that the column labeled 'station' should be removed from df. The axis keyword argument is used to specify whether indices (rows) or columns should be dropped. By setting axis=1, the function removes the column specified by the label ('station') from df. The default value for axis is 0, which would correspond to dropping rows.

Now let me do something else from you:

1
2
dfmeta = pd.read_csv('ZEHNMIN Parameter-Metadaten.csv', index_col=0)
dfmeta.loc[df.columns]
Kurzbeschreibung Beschreibung Einheit
DD Windrichtung Windrichtung, vektorielles Mittel über 10 Minuten °
FF vektorielle Windgeschwindigkeit Windgeschwindigkeit, vektorielles Mittel über ... m/s
GSX Globalstrahlung Globalstrahlung, arithmetisches Mittel über 10... W/m²
P Luftdruck Luftdruck, Basiswert zur Minute10 hPa
RF Relative Feuchte Relative Luftfeuchte, Basiswert zur Minute10 %
RR Niederschlag 10 Minuten Summe des Niederschlags, Summe der ... mm
SO Sonnenscheindauer Sonnenscheindauer, Sekundensumme über 10 Minuten s
TB1 Erdbodentemperatur in 10cm Tiefe Erdbodentemperatur in 10cm Tiefe, Basiswert zu... °C
TB2 Erdbodentemperatur in 20cm Tiefe Erdbodentemperatur in 20cm Tiefe, Basiswert zu... °C
TB3 Erdbodentemperatur in 50cm Tiefe Erdbodentemperatur in 50cm Tiefe, Basiswert zu... °C
TL Lufttemperatur in 2m Lufttemperatur in 2m Höhe, Basiswert zur Minute10 °C
TP Taupunkt Taupunktstemperatur, Basiswert zur Minute10 °C

02-02: again, explain in plain sentences what the dfmeta.loc[df.columns] is doing, and why it works that way.

The expression dfmeta.loc[df.columns] is used to access specific rows from the DataFrame dfmeta based on the column labels present in another DataFrame called df. First, df.columns returns an Index object containing all the column labels in the DataFrame df. Next, the .loc[] function is used on dfmeta with df.columns as the argument. This means that we are selecting rows from dfmeta based on the column labels found in df. In simpler terms, the code is creating a new DataFrame that displays only the rows from dfmeta where the index labels match the column labels in df. It allows us to filter the metadata in dfmeta based on the columns present in df.

Finally, let me do a last step for you before you start coding:

1
dfh = df.resample('H').mean()

02-03: explore the dfh dataframe. Explain, in plain words, what the purpose of .resample('H') followed by mean() is. Explain what .resample('H').max() and .resample('H').sum() would do.

When we explore the dfh DataFrame, we can observe that its columns are shorter compared to the original df DataFrame. Additionally, the indices in dfh only display hourly data. By comparing df.index and dfh.index, we can see that the frequency value of the Datetime index has changed from 'None' to 'H', confirming that the data is now represented at an hourly level.

The .resample('H') method is used to resample the time series data in df at an hourly frequency. By providing the argument 'H', it indicates that we want to group the data into hourly intervals. Since no additional arguments are specified, the default value for the 'label' parameter is 'left', which means that the new indices in dfh represent the start of each hour.

The .mean() method is then applied to compute the new values in dfh. It calculates the mean of all the original values that fall within each hourly interval and assigns this average value to the corresponding index in dfh. As a result, dfh displays the hourly averaged data computed from the original 10-minute data.

If .max() or .sum() were appended after .resample('H'), it would yield different results.

Using .resample('H').max() would compute the maximum value within each hourly interval. This means that for every hour, the highest value from the original data points within that hour would be selected and assigned to the corresponding index in the dfh DataFrame.

On the other hand, .resample('H').sum() would calculate the sum of all the values within each hourly interval. It would add up all the original data points falling within an hour and assign the resulting sum as the value for the corresponding index in dfh.

02-04: Using np.allclose, make sure that the average of the first hour (that you'll compute yourself from df) is indeed equal to the first row of dfh. Now, two variables in the dataframe have units that aren't suitable for averaging. Please convert the following variables to the correct units:

  • RR needs to be converted from the average of 10 min sums to mm/h
  • SO needs to be converted from the average of 10 min sums to s/h
1
2
first_hour_avg = np.array([np.mean(df[label].values[0:6]) for label in df.columns])
np.allclose(first_hour_avg, dfh.iloc[0].values)
True
1
2
dfh['RR'] = dfh['RR']*6
dfh['SO'] = dfh['SO']*6

From now on, we will use the hourly data only (and further aggregations when necessary). The 10 mins data are great but require a little bit more of pandas kung fu (the chinese term, not the sport) to be used efficiently.

Spend some time exploring the dfh dataframe we just created. What time period does it cover? What variables does it contain?

Note on pandas: all the exercises below can be done with or without pandas. Each question can be answered with very few lines of code (often one or two) with pandas, and I recommend to use it as much as possible. If you want, you can always use numpy in case of doubt: you can access the data as a numpy array with: df[column_name].values.

03 - Precipitation¶

In this section, we will focus on precipitation only.

03-01: Compute the average annual precipitation (m/year) over the 7-year period.

1
2
3
dfy = df['RR'].resample('Y',label='left').sum()
avg = np.mean(dfy)
print(f'The average annual precipitation is {np.round(avg,2)} (m/year) over the course of the 7 year period.')
The average annual precipitation is 920.56 (m/year) over the course of the 7 year period.

03-02: What is the smallest non-zero precipitation measured at the station? What is the maximum hourly precipitation measured at the station? When did this occur?

1
2
3
4
5
6
7
min_nzero = dfh.loc[dfh['RR'] > 0, 'RR'].min()
index_min = dfh.loc[dfh['RR'] == min_nzero].index[0]
max_pre = dfh['RR'].max()
index_max_pre = dfh.loc[dfh['RR'] == max_pre].index[0]

print(f"The smallest measured non-zero precipitation is {min_nzero} mm/h and occurred on {index_min}.")
print(f"The maximum measured precipitation is {max_pre} mm/h and occurred on {index_max_pre}.")
The smallest measured non-zero precipitation is 0.1 mm/h and occurred on 2015-01-02 16:00:00.
The maximum measured precipitation is 22.2 mm/h and occurred on 2021-09-16 22:00:00.

03-03: Plot a histogram of hourly precipitation, with bins of size 0.2 mm/h, starting at 0.1 mm/h and ending at 25 mm/h. Plot the same data, but this time with a logarithmic y-axis. Compute the 99th percentile (or quantile) of hourly precipitation.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
# Plot histogram of hourly precipitation (RR) values greater than 0
dfh['RR'][dfh['RR']>0].plot.hist(bins=round((25-0.1)/0.2))
plt.xlabel('Hourly precipitation (mm/h)')
plt.title('Histogram of hourly precipitation from 2015 to 2021')
plt.show()

# Plot histogram of hourly precipitation (RR) values greater than 0 with logarithmic y-axis
dfh['RR'][dfh['RR']>0].plot.hist(bins=round((25-0.1)/0.2), logy=True)
plt.title('Histogram of hourly precipitation from 2015 to 2021')
plt.xlabel('Hourly precipitation (mm/h)')
plt.show()

# Calculate the 99th percentile of hourly precipitation (RR) over the years 2015 to 2021
a = np.sort(dfh['RR'].values)
perc99 = a[np.int64(np.round(0.99*a.size))]
print(f'The 99th percentile of hourly precipitation from 2015 to 2021 is {perc99} mm/h')
No description has been provided for this image
No description has been provided for this image
The 99th percentile of hourly precipitation from 2015 to 2021 is 2.4 mm/h

03-04: Compute daily sums (mm/d) of precipitation (tip: use .resample again). Compute the average number or rain days per year in Innsbruck (a "rain day" is a day with at least 0.1 mm / d of measured precipitation).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
dfd = pd.DataFrame()

# Resample the 'RR' column to calculate the daily sum
dfd['RR'] = df['RR'].resample('D', label='left').sum()

# Create a new column 'rainday' based on whether 'RR' is greater than 0
dfd['rainday'] = dfd['RR'] > 0

# Resample the 'rainday' column to calculate the yearly sum and calculate the average
avg_rain_days = np.mean(dfd['rainday'].resample('Y').sum().values)

print(f"The average number of rainy days per year from 2015 to 2021 was {round(avg_rain_days)}.")
The average number of rainy days per year from 2015 to 2021 was 170.

03-05: Now select (subset) the daily dataframe to keep only only daily data in the months of December, January, February (DFJ). To do this, note that dfh.index.month exists and can be used to subset the data efficiently. Compute the average precipitation in DJF (mm / d), and the average number of rainy days in DJF. Repeat with the months of June, July, August (JJA).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
dfd = pd.DataFrame()

# Resample the 'RR' column to calculate the daily sum
dfd['RR'] = df['RR'].resample('D', label='left').sum()

# Create a new column 'rainday' based on whether 'RR' is greater than 0
dfd['rainday'] = dfd['RR'] > 0

# Resample the 'rainday' column to calculate the yearly sum and calculate the average
avg_rain_days = np.mean(dfd['rainday'].resample('Y').sum().values)

print(f"The average number of rainy days per year from 2015 to 2021 was {round(avg_rain_days)}.")
The average number of rainy days per year from 2015 to 2021 was 170.

03-06: Repeat the DJF and JJA subsetting, but this time with hourly data. Count the total number of times that hourly precipitation in DJF is above the 99th percentile computed in exercise 03-03. Repeat with JJA.

1
2
3
4
5
6
7
dfh_DJF = dfh[dfh.index.month.isin([12, 1, 2])]
above99_DJF = np.sum(dfh_DJF['RR'] > perc99)

dfh_JJA = dfh[dfh.index.month.isin([6, 7, 8])]
above99_JJA = np.sum(dfh_JJA['RR'] > perc99)

print(f"From 2015 to 2021, the hourly precipitation was {above99_DJF} times above the 99th percentile (over the entire 7 years) during December, January, February, and {above99_JJA} times during June, July, August.")
From 2015 to 2021, the hourly precipitation was 58 times above the 99th percentile (over the entire 7 years) during December, January, February, and 298 times during June, July, August.

03-07: Compute and plot the average daily cycle of hourly precipitation in DFJ and JJA. I expect a plot similar to this example. To compute the daily cycle, I recommend to combine two very useful tools. First, start by noticing that ds.index.hour exists and can be used to categorize data. Then, note that df.groupby exists and can be used exactly for that (documentation).

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
hours = np.arange(0, 24)

hourly_prec_avg_DJF = np.array([np.mean(dfh_DJF['RR'][dfh_DJF.index.hour == i]) for i in hours])
hourly_prec_avg_JJA = np.array([np.mean(dfh_JJA['RR'][dfh_JJA.index.hour == i]) for i in hours])

fig, ax = plt.subplots()
ax.plot(hours, hourly_prec_avg_DJF, label='DJF')
ax.plot(hours, hourly_prec_avg_JJA, label='JJA')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Hourly precipitation (mm/h)')
ax.legend(loc='upper right')
ax.set_title('Precipitation Daily Cycle in Innsbruck (2015-2021)')
plt.show()
No description has been provided for this image

04 - A few other variables¶

In this section, we will continue to analyze the weather station data.

04-01: Verify that the three soil temperatures have approximately the same average value over the entire period. Now plot the three soil temperature timeseries in hourly resolution over the course of the year of 2020 (example). Repeat the plot with the month of may 2020.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
# Check if the means of 'TB1', 'TB2', and 'TB3' are close to each other
mean_check = np.allclose(dfh['TB1'].mean(), dfh['TB2'].mean(), dfh['TB3'].mean())

# Filter the dfh DataFrame for the year 2020
dfh_2020 = dfh[dfh.index.year == 2020]

# Create a new DataFrame, dfh_2020_TB, with columns 'TB1', 'TB2', and 'TB3' for the year 2020
dfh_2020_TB = pd.DataFrame({'TB1': dfh_2020['TB1'], 'TB2': dfh_2020['TB2'], 'TB3': dfh_2020['TB3']})

# Plot the soil temperature for the year 2020
dfh_2020_TB.plot(ylabel='Soil Temperature (°C)', title='Soil Temperature in Innsbruck - 2020')

# Filter the dfh_2020 DataFrame for the month of May
dfh_2020_05 = dfh_2020[dfh_2020.index.month == 5]

# Create a new DataFrame, dfh_2020_05_TB, with columns 'TB1', 'TB2', and 'TB3' for May 2020
dfh_2020_05_TB = pd.DataFrame({'TB1': dfh_2020_05['TB1'], 'TB2': dfh_2020_05['TB2'], 'TB3': dfh_2020_05['TB3']})

# Plot the soil temperature for May 2020
dfh_2020_05_TB.plot(ylabel='Soil Temperature (°C)', title='Soil Temperature in Innsbruck - May 2020')
<Axes: title={'center': 'Soil Temperature in Innsbruck - May 2020'}, xlabel='time', ylabel='Soil Temperature (°C)'>
No description has been provided for this image
No description has been provided for this image

04-02: Plot the average daily cycle of all three soil temperatures.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
hours = np.arange(0, 24)

hourly_avg_TB1 = np.array([np.mean(dfh['TB1'][dfh.index.hour == i]) for i in hours])
hourly_avg_TB2 = np.array([np.mean(dfh['TB2'][dfh.index.hour == i]) for i in hours])
hourly_avg_TB3 = np.array([np.mean(dfh['TB3'][dfh.index.hour == i]) for i in hours])

fig, ax = plt.subplots()
ax.plot(hours, hourly_avg_TB1, label='TB1')
ax.plot(hours, hourly_avg_TB2, label='TB2')
ax.plot(hours, hourly_avg_TB3, label='TB3')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Soil Temperature (°C)')
ax.set_title('Average Daily Cycle of Soil Temperatures (2015-2021)')
ax.legend()
plt.show()
No description has been provided for this image

04-03: Compute the difference (in °C) between the air temperature and the dewpoint temperature. Now plot this difference on a scatter plot (x-axis: relative humidity, y-axis: temperature difference).

1
2
3
4
5
6
7
8
9
dfh['TDIFF'] = dfh['TL'] - dfh['TP']

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(dfh['RF'], dfh['TDIFF'])
ax.set_xlabel('Relative Humidity')
ax.set_ylabel('Temperature Difference (TL - TP) in °C')
ax.set_title('Air and Dewpoint Temperature Difference Dependent on Relative Humidity')

plt.show()
No description has been provided for this image

05 - Free coding project¶

The last part of this semester project is up to you! You are free to explore whatever interests you. I however add three requirements:

  1. This section should have at least 5 original plots in it. They are the output of your analysis.
  2. This section should also use additional data that you downloaded yourself. The easiest way would probably be to download another station(s) from the ZAMG database, or data from the same station but for another time period (e.g. for trend or change analysis). You can, however, decide to do something completely different if you prefer (as long as you download and read one more file).
  3. this section should contain at least one regression or correlation analysis between two parameters. Examples:
    • between two different variables at the same station (like we did with the dewpoint above)
    • between different stations (for example, average temperature as a function of station elevation)
    • between average temperature and time (trends analysis)
    • etc.

That's it! Here are a few ideas:

  • detection of trends and changes at the station Innsbruck for 1993-2021
  • comparison of 5-yr climatologies at various stations in Tirol, taking elevation or location into account
  • compute the theoretical day length from the station's longitude and latitude (you can find solutions for this online, just let me know the source if you used a solution online), and use these computations to compare the measured sunshine duration to the maximum day length. This can be used to classify "sunshine days" for example.
  • use the python "windrose" library to plot a windrose at different locations and time of day.
  • etc.

If you have your own idea but are unsure about whether this is too much or not enough, come to see me in class! In general, the three requirements above should be enough.

My goal with this section is to let you formulate a programming goal and implement it.

Ideas: My Mum works as a gardener at home, so we have big garde at home where the weather is a very relevant to the amount she can harvest. My Dad build a litte Weathestation on top of our greenhouse,unfortunaly the are at vacation, so i cannot use use my dads data or compare it to the nearest weather station. Also i yet do not know how to properly use the puplic german weather data by the DWD. So i decided to Analyse the data from the station: HAHNENKAMM-EHRENBACHHOEHE. This station is the closest one to my friends ski hut. Due to strong wind often the lifts cannot run, thats why i am going to analyse the wind at the station HAHNENKAMM-EHRENBACHHOEHE.

For this I downloaded 10-minute data from the ZAMG Datahub. For the station 'HAHNENKAMM-EHRENBACHHOEHE' I am observing wind direction, wind speed, maximum wind speed and air temperature from 2020/01/01 to 2023/05/31:

1
2
3
dfd_HE = pd.read_csv('HAHNENKAMM-EHRENBACHHOEHE_20200101T0000_20230531T0000.csv', index_col=1, parse_dates=True)
dfd_HE = dfd_HE.drop('time', axis=1)
#dfd_HE.columns

05

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
def check_wind_dir(direction):
    direction = np.asanyarray(direction)
    if np.any(direction < -360):
        raise ValueError("Invalid wind direction value")
    return np.fmod(direction + 360, 360)

def sd_to_uv(speed, direction):
    direction = check_wind_dir(270 - np.array(direction))  # Adjusted calculation for wind direction
    direction_rad = np.deg2rad(direction)
    v = np.sin(direction_rad) * speed
    u = np.cos(direction_rad) * speed
    return u, v

def uv_to_sd(u, v):
    speed = np.sqrt(np.square(np.array(u)) + np.square(np.array(v)))
    direction = np.zeros_like(speed)
    positive_v = v > 0
    negative_v = ~positive_v
    direction[positive_v] = check_wind_dir(270 - np.rad2deg(np.arccos(u[positive_v] / speed[positive_v])))
    direction[negative_v] = check_wind_dir(270 + np.rad2deg(np.arccos(u[negative_v] / speed[negative_v])))
    return speed, direction

dfd_HE['u_wind'], dfd_HE['v_wind'] = sd_to_uv(dfd_HE['FF'], dfd_HE['DD'])
1
dfd_HE.describe()
DD DDX DDX_FLAG DD_FLAG FF FFAM FFAM_FLAG FFX FFX_FLAG FF_FLAG ... TS TSMAX TSMAX_FLAG TSMIN TSMIN_FLAG TS_FLAG ZEITX ZEITX_FLAG u_wind v_wind
count 180432.000000 180432.000000 180433.000000 180433.000000 180433.000000 180433.000000 180433.000000 180432.000000 180433.000000 180433.000000 ... 0.0 0.0 180433.0 0.0 180433.0 180433.0 180402.000000 180433.000000 180432.000000 180432.000000
mean 209.004689 208.217666 100.059867 100.059867 2.205998 2.387245 100.061519 4.341719 100.077048 100.064290 ... NaN NaN 1.0 NaN 1.0 1.0 29.127892 100.004711 0.852420 0.118940
std 96.897075 96.301152 3.068559 3.068559 1.355867 1.370480 3.095855 2.705785 3.336435 3.140236 ... NaN NaN 0.0 NaN 0.0 0.0 17.352033 2.270425 1.880038 1.558702
min 0.000000 1.000000 2.000000 2.000000 0.000000 0.100000 100.000000 0.200000 2.000000 100.000000 ... NaN NaN 1.0 NaN 1.0 1.0 0.000000 2.000000 -15.316602 -6.824958
25% 133.000000 133.000000 100.000000 100.000000 1.300000 1.400000 100.000000 2.500000 100.000000 100.000000 ... NaN NaN 1.0 NaN 1.0 1.0 14.000000 100.000000 -0.375877 -0.674292
50% 249.000000 246.000000 100.000000 100.000000 2.000000 2.100000 100.000000 3.700000 100.000000 100.000000 ... NaN NaN 1.0 NaN 1.0 1.0 29.000000 100.000000 0.999391 0.000000
75% 277.000000 275.000000 100.000000 100.000000 2.800000 3.000000 100.000000 5.400000 100.000000 100.000000 ... NaN NaN 1.0 NaN 1.0 1.0 44.000000 100.000000 2.000000 0.771784
max 360.000000 360.000000 300.000000 300.000000 18.000000 18.600000 300.000000 37.000000 300.000000 300.000000 ... NaN NaN 1.0 NaN 1.0 1.0 59.000000 400.000000 11.301332 17.213486

8 rows × 54 columns

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
# Resample the DataFrame to daily frequency and compute the mean
dfd_HE_resampled = dfd_HE.resample('D', label='left').mean()

# Remove the last row
dfd_HE_resampled = dfd_HE_resampled.iloc[:-1]

# Compute the FF_from_uv and DD columns
dfd_HE_resampled['FF_from_uv'], dfd_HE_resampled['DD'] = uv_to_sd(dfd_HE_resampled['u_wind'], dfd_HE_resampled['v_wind'])

# Create the 'windday' column based on the FF column
dfd_HE_resampled['windday'] = dfd_HE_resampled['FF'] > 1

# Resample 'windday' column to compute yearly windy days
yearly_windy_days = dfd_HE_resampled['windday'].resample('Y').sum()

# Plot the yearly windy days
yearly_windy_days.plot(title='Average number of wind days per year from 2020 to 2023', ylabel='Amount of wind days')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[104], line 17
     14 yearly_windy_days = dfd_HE_resampled['windday'].resample('Y').sum()
     16 # Plot the yearly windy days
---> 17 yearly_windy_days.plot(title='Average number of wind days per year from 2020 to 2023', ylabel='Amount of wind days')

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\plotting\_core.py:975, in PlotAccessor.__call__(self, *args, **kwargs)
    972             label_name = label_kw or data.columns
    973             data.columns = label_name
--> 975 return plot_backend.plot(data, kind=kind, **kwargs)

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\plotting\_matplotlib\__init__.py:71, in plot(data, kind, **kwargs)
     69         kwargs["ax"] = getattr(ax, "left_ax", ax)
     70 plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71 plot_obj.generate()
     72 plot_obj.draw()
     73 return plot_obj.result

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\plotting\_matplotlib\core.py:446, in MPLPlot.generate(self)
    444 def generate(self) -> None:
    445     self._args_adjust()
--> 446     self._compute_plot_data()
    447     self._setup_subplots()
    448     self._make_plot()

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\plotting\_matplotlib\core.py:632, in MPLPlot._compute_plot_data(self)
    630 # no non-numeric frames or series allowed
    631 if is_empty:
--> 632     raise TypeError("no numeric data to plot")
    634 self.data = numeric_data.apply(self._convert_to_ndarray)

TypeError: no numeric data to plot
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
months = np.arange(1, 13)
month_dict = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
              7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}

# Resample 'windday' column to compute monthly windy days
monthly_windy_days = dfd_HE['windday'].resample('M').sum()

# Compute the average number of windy days per month
monthly_windy_days_avg = np.zeros(months.size)
for i in months:
    monthly_windy_days_avg[i - 1] = monthly_windy_days[monthly_windy_days.index.month == i].mean()

# Plot the average yearly cycle of wind days per month
plt.plot(months, monthly_windy_days_avg)
plt.xticks(months, ['J', 'F', 'M', 'A', 'M', 'J', 'J', 'A', 'S', 'O', 'N', 'D'])
plt.xlabel('Months')
plt.ylabel('Amount of Wind Days')
plt.title('Average Yearly Cycle of Wind Days per Month')
plt.show()

# Find the month with the most wind days and the month with the least wind days
max_month = month_dict[monthly_windy_days_avg.argmax() + 1]
max_value = monthly_windy_days_avg.max()
min_month = month_dict[monthly_windy_days_avg.argmin() + 1]
min_value = monthly_windy_days_avg.min()

print(f"Month with the most wind days: {max_month}, number: {max_value}")
print(f"Month with the least wind days: {min_month}, number: {min_value}")
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
File ~\mambaforge\envs\inpro\lib\site-packages\pandas\core\indexes\base.py:3652, in Index.get_loc(self, key)
   3651 try:
-> 3652     return self._engine.get_loc(casted_key)
   3653 except KeyError as err:

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\_libs\index.pyx:147, in pandas._libs.index.IndexEngine.get_loc()

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\_libs\index.pyx:176, in pandas._libs.index.IndexEngine.get_loc()

File pandas\_libs\hashtable_class_helper.pxi:7080, in pandas._libs.hashtable.PyObjectHashTable.get_item()

File pandas\_libs\hashtable_class_helper.pxi:7088, in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'windday'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[107], line 6
      2 month_dict = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',
      3               7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}
      5 # Resample 'windday' column to compute monthly windy days
----> 6 monthly_windy_days = dfd_HE['windday'].resample('M').sum()
      8 # Compute the average number of windy days per month
      9 monthly_windy_days_avg = np.zeros(months.size)

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\core\frame.py:3761, in DataFrame.__getitem__(self, key)
   3759 if self.columns.nlevels > 1:
   3760     return self._getitem_multilevel(key)
-> 3761 indexer = self.columns.get_loc(key)
   3762 if is_integer(indexer):
   3763     indexer = [indexer]

File ~\mambaforge\envs\inpro\lib\site-packages\pandas\core\indexes\base.py:3654, in Index.get_loc(self, key)
   3652     return self._engine.get_loc(casted_key)
   3653 except KeyError as err:
-> 3654     raise KeyError(key) from err
   3655 except TypeError:
   3656     # If we have a listlike key, _check_indexing_error will raise
   3657     #  InvalidIndexError. Otherwise we fall through and re-raise
   3658     #  the TypeError.
   3659     self._check_indexing_error(key)

KeyError: 'windday'
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from windrose import WindroseAxes
import matplotlib.gridspec as gridspec
import matplotlib.cm as cm

dfd_HE = dfd_HE.resample('M', label='left').mean()
dfd_HE = dfm_gmuend.iloc[:-1]
dfd_HE['FF_from_uv'], dfd_HE['DD'] = uv_to_sd(dfd_HE['u_wind'], dfd_HE['v_wind'])

fig = plt.figure(figsize=(20, 10))
heights = [10, 4, 10, 4, 10, 4, 10]
gs = gridspec.GridSpec(7, 3, height_ratios=heights)
gp = gs.get_grid_positions(fig)

def wind_plot(x, y, title, fig, rect):
    ax = WindroseAxes(fig, rect)
    fig.add_axes(ax)
    ax.bar(x, y, opening=0.8, edgecolor='white', bins=[0, 0.25, 0.5, 1, 2, 5, 10], cmap=cm.hot)
    ax.set_title(title, position=(0.5, 1.1))

for i in range(4):
    for j in range(3):
        rect = [gp[2][j], gp[0][2*i], gp[3][j] - gp[2][j], gp[1][2*i] - gp[0][2*i]]  # [left, bottom, width, height]
        month_index = 3 * i + j + 1
        wind_plot(dfd_HE['DD'][dfd_HE.index.month == month_index],
                  dfd_HE['FF'][dfd_HE.index.month == month_index],
                  month_dict[month_index], fig, rect)

plt.show()
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[108], line 1
----> 1 from windrose import WindroseAxes
      2 import matplotlib.gridspec as gridspec
      3 import matplotlib.cm as cm

ModuleNotFoundError: No module named 'windrose'
1