import pandas as pd
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

# We first read the data from the file.  The following code sets lines to be a list 
# of strings, one for each line in the file.
data_dir = '../data/misc'
with open(f'{data_dir}/co2_daily_mlo.txt') as f:
    lines = f.readlines()

# We now discard the comment lines, and parse each of the remaining lines into a list of numbers.
records = []
for i, line in enumerate(lines):
    if line.startswith('#'):
        continue
    fields = line.split()
    x = [int(fields[0]), int(fields[1]), int(fields[2]), float(fields[3]), float(fields[4])]
    records.append(x)

# We now convert the list of records into a pandas DataFrame.
# If it were not for the comment lines, we could have used pandas.read_fwf() instead
df = pd.DataFrame.from_records(records, columns=['year', 'month', 'day', 'decimal_date', 'co2'])

display(df.info())
display(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15602 entries, 0 to 15601
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          15602 non-null  int64  
 1   month         15602 non-null  int64  
 2   day           15602 non-null  int64  
 3   decimal_date  15602 non-null  float64
 4   co2           15602 non-null  float64
dtypes: float64(2), int64(3)
memory usage: 609.6 KB

None

df.plot(x='decimal_date', y='co2', title='$CO_2$ concentration at Mauna Loa Observatory')

<Axes: title={'center': '$CO_2$ concentration at Mauna Loa Observatory'}, xlabel='decimal_date'>

def p(x, a0, a1, a2, b0, b1):
    return a0 + a1 * x + a2 * x * x + b0 * np.cos(2*np.pi*x) + b1 * np.sin(2*np.pi*x)

ab = curve_fit(p, df['decimal_date'], df['co2'], p0=[-3650,2,0,0,0])[0]

print(ab)

df['co2_pred'] = p(df['decimal_date'], *ab)

[ 5.43617180e+04 -5.58536308e+01  1.44287252e-02 -9.10445762e-01
  2.81147996e+00]

df.plot(x='decimal_date', y=['co2', 'co2_pred'], title='CO2 concentration at Mauna Loa Observatory')
plt.legend(['Measured $CO_2$ concentration', 'Modelled $CO_2$ concentration'])

<matplotlib.legend.Legend at 0x2d5fe13f110>

	year	month	day	decimal_date	co2
count	15602.000000	15602.000000	15602.000000	15602.000000	15602.000000
mean	2000.246443	6.469491	15.760095	2000.740737	373.732559
std	14.361954	3.494095	8.799955	14.359869	27.046583
min	1974.000000	1.000000	1.000000	1974.378100	326.060000
25%	1989.000000	3.000000	8.000000	1989.082875	352.190000
50%	2000.000000	6.000000	16.000000	2000.952200	370.660000
75%	2013.000000	10.000000	23.000000	2013.081525	395.567500
max	2025.000000	12.000000	31.000000	2025.246600	430.600000

Mauna Loa carbon dioxide concentration¶