"""Functions for loading real-world time series datasets.
This module provides convenience functions for loading commonly used
time series datasets in the format expected by vangja (columns: ds, y).
"""
from __future__ import annotations
import tempfile
from pathlib import Path
from typing import Literal
from urllib.request import urlopen
import pandas as pd
[docs]
def load_air_passengers() -> pd.DataFrame:
    """Load the Air Passengers dataset.

    A classic monthly time series of international airline passenger
    totals, running from January 1949 to December 1960
    (144 observations).

    Characteristics of the series:

    - Clear upward trend
    - Strong yearly seasonality
    - Multiplicative seasonality (variance increases with level)

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - `ds`: datetime, monthly timestamps from 1949-01 to 1960-12
        - `y`: number of passengers (in thousands)

    Examples
    --------
    >>> from vangja.datasets import load_air_passengers
    >>> df = load_air_passengers()
    >>> print(f"Shape: {df.shape}")
    Shape: (144, 2)
    >>> print(f"Date range: {df['ds'].min().date()} to {df['ds'].max().date()}")
    Date range: 1949-01-01 to 1960-12-01

    Notes
    -----
    Data is downloaded from the Prophet examples repository on GitHub,
    so a network connection is needed on every call.

    Original source: Box, G. E. P., Jenkins, G. M. and Reinsel, G. C. (1976)
    Time Series Analysis, Forecasting and Control. Third Edition.
    """
    source_url = "https://raw.githubusercontent.com/facebook/prophet/main/examples/example_air_passengers.csv"
    passengers = pd.read_csv(source_url)
    # The CSV stores dates as text; convert them so ``ds`` is a datetime column.
    return passengers.assign(ds=pd.to_datetime(passengers["ds"]))
[docs]
def load_peyton_manning() -> pd.DataFrame:
    """Load the Peyton Manning Wikipedia page views dataset.

    Daily log-transformed Wikipedia page views for Peyton Manning,
    covering December 2007 to January 2016 (2905 observations).

    Characteristics of the series:

    - Multiple trend changes (career events)
    - Strong yearly seasonality (NFL season)
    - Weekly seasonality (game days)
    - Holiday effects (Super Bowl, playoffs)

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - `ds`: datetime, daily timestamps from 2007-12-10 to 2016-01-20
        - `y`: float, log-transformed page views

    Examples
    --------
    >>> from vangja.datasets import load_peyton_manning
    >>> df = load_peyton_manning()
    >>> print(f"Shape: {df.shape}")
    Shape: (2905, 2)
    >>> print(f"Date range: {df['ds'].min().date()} to {df['ds'].max().date()}")
    Date range: 2007-12-10 to 2016-01-20

    Notes
    -----
    Data is downloaded from the Prophet examples repository on GitHub,
    so a network connection is needed on every call.

    This is the same dataset used in the original Prophet paper and tutorials.
    """
    source_url = "https://raw.githubusercontent.com/facebook/prophet/main/examples/example_wp_log_peyton_manning.csv"
    page_views = pd.read_csv(source_url)
    # Dates arrive as strings; parse them into proper datetimes.
    parsed_dates = pd.to_datetime(page_views["ds"])
    return page_views.assign(ds=parsed_dates)
[docs]
def load_citi_bike_sales() -> pd.DataFrame:
    """Load the Citi Bike station 360 sales dataset.

    This dataset contains daily bike ride counts from Citi Bike station 360
    in New York City (2013-07-01 to 2014-10-31). It is used to demonstrate
    forecasting short time series with transfer learning.

    The dataset exhibits:

    - Strong weekly seasonality (weekday vs weekend patterns)
    - Yearly seasonality correlated with temperature/weather
    - Approximately 3 months of initial data used for training (~106 days)

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - ``ds``: datetime, daily timestamps from 2013-07-01 to 2014-10-31
        - ``y``: float, number of bike rides

        Rows whose ``y`` is missing are dropped. The index is whatever is
        left after filtering (it is not reset).

    Raises
    ------
    ImportError
        If ``pyreadr`` is not installed.

    Examples
    --------
    >>> from vangja.datasets import load_citi_bike_sales
    >>> df = load_citi_bike_sales()
    >>> print(f"Shape: {df.shape}")  # doctest: +SKIP
    Shape: (488, 2)

    Notes
    -----
    This dataset is from Tim Radtke's blog post "Modeling Short Time Series
    with Prior Knowledge". The vangja library was partially inspired by this
    work and Juan Orduz's PyMC implementation.

    Requires the ``pyreadr`` package (install with ``pip install vangja[datasets]``).

    The RDS file is downloaded on every call and staged in a temporary
    directory that is removed before the function returns.

    References
    ----------
    .. [1] Radtke, T. (2019). Modeling Short Time Series with Prior Knowledge.
       https://minimizeregret.com/short-time-series-prior-knowledge
    .. [2] Orduz, J. (2022). Modeling Short Time Series with Prior Knowledge in PyMC.
       https://juanitorduz.github.io/short_time_series_pymc/
    """
    try:
        import pyreadr
    except ImportError as e:
        raise ImportError(
            "pyreadr is required to load Citi Bike data. "
            "Install with: pip install vangja[datasets]"
        ) from e

    url = "https://github.com/timradtke/short-time-series/raw/master/citi_bike_360.Rds"

    # pyreadr reads from a file path, so the download has to be staged on
    # disk. A temporary *directory* is used rather than NamedTemporaryFile
    # because a still-open NamedTemporaryFile cannot be reopened by name on
    # Windows; here the file is fully written and closed before it is read.
    with tempfile.TemporaryDirectory() as tmp_dir:
        rds_path = Path(tmp_dir) / "citi_bike_360.Rds"
        with urlopen(url) as resp:
            rds_path.write_bytes(resp.read())
        rds_result = pyreadr.read_r(str(rds_path))

    df = rds_result[None]  # RDS files have a single dataframe with key None
    df = df.rename(columns={"date": "ds", "rides": "y"})
    df["ds"] = pd.to_datetime(df["ds"])

    # Remove rows with missing values
    df = df.dropna(subset=["y"])

    # Keep data before 2015-10-01.
    # NOTE(review): the docstring documents the series as ending on
    # 2014-10-31, which does not match this cutoff — confirm which one is
    # intended. The filter is left unchanged.
    df = df[df["ds"] < "2015-10-01"]
    return df[["ds", "y"]]
[docs]
def load_nyc_temperature(return_daily_average: bool = True) -> pd.DataFrame:
    """Load New York City historical temperature data.

    This dataset contains temperature readings for New York City from
    2012-10-01 to 2017-11-29. It is used to learn yearly seasonality
    patterns that can be transferred to short time series.

    The dataset exhibits:

    - Strong yearly seasonality (summer highs, winter lows)
    - Consistent periodic pattern across years

    Parameters
    ----------
    return_daily_average : bool, default True
        If True, readings are resampled to calendar days and averaged
        (daily **mean**). If False, the rows are returned at the
        resolution stored in the CSV, without aggregation.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - ``ds``: datetime (one row per calendar day when
          ``return_daily_average`` is True)
        - ``y``: float, temperature taken from the ``"New York"`` column

        No unit conversion is applied, so ``y`` is in whatever unit the
        upstream CSV uses.

    Examples
    --------
    >>> from vangja.datasets import load_nyc_temperature
    >>> df = load_nyc_temperature()
    >>> print(f"Shape: {df.shape}")  # doctest: +SKIP
    Shape: (1886, 2)

    Notes
    -----
    This dataset is from Tim Radtke's blog post "Modeling Short Time Series
    with Prior Knowledge". The temperature seasonality can be used as prior
    information for forecasting related short time series (e.g., bike sales).

    Earlier documentation described ``y`` as the *maximum* daily temperature
    in Fahrenheit. The implementation has always computed the daily *mean*
    and performs no unit conversion.
    NOTE(review): confirm the unit of the upstream file (the Kaggle source
    it derives from is documented elsewhere in this module as Kelvin).

    References
    ----------
    .. [1] Radtke, T. (2019). Modeling Short Time Series with Prior Knowledge.
       https://minimizeregret.com/short-time-series-prior-knowledge
    .. [2] Original data from Kaggle historical hourly weather data.
       https://www.kaggle.com/selfishgene/historical-hourly-weather-data
    """
    url = "https://raw.githubusercontent.com/timradtke/short-time-series/master/temperature.csv"
    df = pd.read_csv(url)
    df = df.rename(columns={"datetime": "ds", "New York": "y"})
    df["ds"] = pd.to_datetime(df["ds"])

    # Narrow to the output columns *before* aggregating: the mean is then
    # computed only for the column that is returned, and an unrelated
    # non-numeric column in the CSV cannot make ``mean()`` fail.
    df = df[["ds", "y"]]

    if return_daily_average:
        df = df.resample("D", on="ds").mean().reset_index()
    return df
[docs]
def load_stock_data(
    tickers: list[str],
    split_date: str | pd.Timestamp,
    window_size: int,
    horizon_size: int,
    cache_path: Path | None = None,
    interpolate: bool = False,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load historical stock data split into training and test sets.

    Downloads daily OHLCV data for the specified tickers using Yahoo
    Finance and computes the typical price as
    ``(Open + High + Low + Close) / 4`` (both done by the internal
    ``_download_stock_data`` helper). The data is split into a
    training window and a test horizon around ``split_date``.

    Parameters
    ----------
    tickers : list[str]
        List of ticker symbols to download (e.g., ``["AAPL", "MSFT"]``).
    split_date : str or pd.Timestamp
        The date separating training and test data. Training data
        covers ``[split_date - (window_size - 1) days, split_date]``
        (``split_date`` itself belongs to the training set) and test
        data covers ``(split_date, split_date + horizon_size days]``.
    window_size : int
        Number of calendar days in the training window, counting
        ``split_date`` as the last day.
    horizon_size : int
        Number of calendar days in the test horizon, starting the day
        after ``split_date``.
    cache_path : Path or None, default None
        Directory for caching downloaded data. Each ticker is stored
        as a CSV file. If None, data is downloaded without caching.
        If provided, parent directories are created if they do not
        exist.
    interpolate : bool, default False
        If True, missing days (weekends, holidays) within each series
        are filled using linear interpolation after reindexing to a
        daily calendar.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train_df, test_df)`` — DataFrames with columns, in this order:

        - ``ds``: datetime
        - ``y``: float, typical price
        - ``series``: str, ticker symbol

        Both frames are empty (but carry these columns) when no data is
        available for the requested tickers or date range.

    Examples
    --------
    >>> from vangja.datasets import load_stock_data
    >>> train, test = load_stock_data(
    ...     ["AAPL"], "2024-01-01", window_size=365, horizon_size=30
    ... )  # doctest: +SKIP
    >>> print(train.columns.tolist())  # doctest: +SKIP
    ['ds', 'y', 'series']

    Notes
    -----
    Requires the ``yfinance`` package (install with
    ``pip install vangja[datasets]``).
    """
    from vangja.datasets.stocks import _download_stock_data

    output_columns = ["ds", "y", "series"]

    split = pd.Timestamp(split_date)
    start = split - pd.Timedelta(days=window_size - 1)
    end = split + pd.Timedelta(days=horizon_size)
    # Work on a range padded by 5 days on each side — presumably so that
    # interpolation near the window edges has real observations on both
    # sides (e.g. when the window starts on a weekend).
    extended_start = start - pd.Timedelta(days=5)
    extended_end = end + pd.Timedelta(days=5)

    data = _download_stock_data(tickers, cache_path=cache_path)
    if data.empty:
        empty: pd.DataFrame = pd.DataFrame(columns=output_columns)
        return empty, empty.copy()

    # Build output DataFrame in the documented column order (ds, y, series)
    # so every return path has the same layout.
    result = data[["ds", "typical_price", "ticker"]].rename(
        columns={"ticker": "series", "typical_price": "y"},
    )

    # Filter to requested (padded) date range
    result = result[
        (result["ds"] >= extended_start) & (result["ds"] <= extended_end)
    ].copy()

    if interpolate:
        # The daily calendar is identical for every ticker, so build it once.
        full_range = pd.date_range(start=extended_start, end=extended_end, freq="D")
        interpolated: list[pd.DataFrame] = []
        # sort=False keeps tickers in order of first appearance.
        for ticker, ticker_data in result.groupby("series", sort=False):
            ticker_data = ticker_data.set_index("ds").reindex(full_range)
            ticker_data["y"] = ticker_data["y"].interpolate(method="linear")
            ticker_data["series"] = ticker
            # Name the index explicitly so reset_index always yields "ds".
            ticker_data = ticker_data.rename_axis("ds").reset_index()
            # Drop days that interpolation could not fill (no observation
            # available to anchor them).
            ticker_data = ticker_data.dropna(subset=["y"])
            interpolated.append(ticker_data[output_columns])
        if interpolated:
            result = pd.concat(interpolated, ignore_index=True)
        else:
            # No rows in range: pd.concat([]) would raise ValueError, so
            # keep an empty frame that still has the output columns.
            result = result.iloc[0:0]

    # Split into train and test
    train_df = (
        result[(result["ds"] >= start) & (result["ds"] <= split)]
        .copy()
        .reset_index(drop=True)
    )
    test_df = (
        result[(result["ds"] > split) & (result["ds"] <= end)]
        .copy()
        .reset_index(drop=True)
    )
    return train_df, test_df
def _ensure_kagglehub() -> None:
"""Import kagglehub or raise a helpful ImportError."""
try:
import kagglehub # noqa: F401
except ImportError as e:
raise ImportError(
"kagglehub is required to download Kaggle datasets. "
"Install with: pip install vangja[datasets]"
) from e
# Accepted values for the ``city`` argument of ``load_kaggle_temperature``.
# Member order is unchanged; entries are only grouped by country for reading.
KaggleTemperatureCity = Literal[
    # United States
    "Portland", "San Francisco", "Seattle", "Los Angeles", "San Diego",
    "Las Vegas", "Phoenix", "Albuquerque", "Denver", "San Antonio",
    "Dallas", "Houston", "Kansas City", "Minneapolis", "Saint Louis",
    "Chicago", "Nashville", "Indianapolis", "Atlanta", "Detroit",
    "Jacksonville", "Charlotte", "Miami", "Pittsburgh", "Philadelphia",
    "New York", "Boston",
    # Canada
    "Vancouver", "Toronto", "Montreal",
    # Israel
    "Beersheba", "Tel Aviv District", "Eilat", "Haifa", "Nahariyya",
    "Jerusalem",
]
"""Valid city names in the Kaggle historical-hourly-weather-data temperature.csv."""
# Accepted values for the ``column`` argument of ``load_smart_home_readings``.
# Member order is unchanged; entries are only grouped for reading.
SmartHomeColumn = Literal[
    # Whole-house totals
    "use [kW]", "gen [kW]", "House overall [kW]",
    # Individual appliances, rooms and circuits
    "Dishwasher [kW]", "Furnace 1 [kW]", "Furnace 2 [kW]",
    "Home office [kW]", "Fridge [kW]", "Wine cellar [kW]",
    "Garage door [kW]", "Kitchen 12 [kW]", "Kitchen 14 [kW]",
    "Kitchen 38 [kW]", "Barn [kW]", "Well [kW]",
    "Microwave [kW]", "Living room [kW]", "Solar [kW]",
]
"""Valid appliance / total columns in the Kaggle smart-home HomeC.csv."""
[docs]
def load_kaggle_temperature(
    city: KaggleTemperatureCity = "New York",
    start_date: str | pd.Timestamp | None = None,
    end_date: str | pd.Timestamp | None = None,
    freq: str = "D",
) -> pd.DataFrame:
    """Load historical hourly temperature data from Kaggle.

    Downloads the ``temperature.csv`` file from the
    `Historical Hourly Weather Data
    <https://www.kaggle.com/datasets/selfishgene/historical-hourly-weather-data>`_
    dataset. Returns data for the requested city, filtered to the given
    date range and aggregated to the specified frequency.

    The raw data contains hourly observations in **Kelvin**. Values are
    converted to **Celsius** before returning.

    Parameters
    ----------
    city : KaggleTemperatureCity, default "New York"
        City column to extract. Must be one of the 36 cities in the
        dataset (see ``KaggleTemperatureCity``).
    start_date : str, pd.Timestamp, or None, default None
        Start of the date range (inclusive). If None, the earliest
        available date is used (~2012-10-01).
    end_date : str, pd.Timestamp, or None, default None
        End of the date range (inclusive). A bound without a time of
        day (e.g. ``"2015-12-31"``, i.e. midnight) is treated as the
        whole calendar day, so every reading on that day is kept. A
        bound with an explicit non-midnight time is applied exactly.
        If None, the latest available date is used (~2017-11-30).
    freq : str, default "D"
        Pandas offset alias for temporal aggregation (e.g. ``"D"`` for
        daily mean, ``"W"`` for weekly mean, ``"h"`` for hourly — no
        aggregation). The aggregation function is ``mean``.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - ``ds``: datetime
        - ``y``: float, temperature in degrees Celsius
        - ``series``: str, the original city name from the Kaggle dataset

        Periods with no observations are dropped.

    Raises
    ------
    ImportError
        If ``kagglehub`` is not installed.
    ValueError
        Raised by pandas when ``city`` is not a column of the CSV.

    Examples
    --------
    >>> from vangja.datasets import load_kaggle_temperature
    >>> df = load_kaggle_temperature("New York", "2015-01-01", "2015-12-31")  # doctest: +SKIP
    >>> print(df.columns.tolist())  # doctest: +SKIP
    ['ds', 'y', 'series']

    Notes
    -----
    Requires the ``kagglehub`` package (install with
    ``pip install vangja[datasets]``).

    Data is downloaded and cached locally by ``kagglehub``. A valid
    Kaggle API token is required (see
    `Kaggle API docs <https://github.com/Kaggle/kaggle-api#api-credentials>`_).

    References
    ----------
    .. [1] Historical Hourly Weather Data.
       https://www.kaggle.com/datasets/selfishgene/historical-hourly-weather-data
    """
    _ensure_kagglehub()
    import kagglehub

    path = kagglehub.dataset_download(
        "selfishgene/historical-hourly-weather-data",
    )
    csv_path = Path(path) / "temperature.csv"
    df = pd.read_csv(csv_path, usecols=["datetime", city])
    df = df.rename(columns={"datetime": "ds", city: "y"})
    df["ds"] = pd.to_datetime(df["ds"])

    # Convert Kelvin → Celsius
    df["y"] = df["y"] - 273.15

    # Drop missing values
    df = df.dropna(subset=["y"])

    # Filter date range
    if start_date is not None:
        df = df[df["ds"] >= pd.Timestamp(start_date)]
    if end_date is not None:
        end = pd.Timestamp(end_date)
        if end == end.normalize():
            # Date-only bound: comparing against midnight would discard
            # every reading after 00:00 on the final day, so keep the
            # whole day instead.
            df = df[df["ds"] < end + pd.Timedelta(days=1)]
        else:
            df = df[df["ds"] <= end]

    # Aggregate to requested frequency
    df = df.resample(freq, on="ds").mean(numeric_only=True).reset_index()
    # Resampling emits a row for every period in the span; drop the empty ones.
    df = df.dropna(subset=["y"])

    # Add series column for compatibility with multi-series datasets (e.g. smart home)
    df["series"] = city
    return df[["ds", "y", "series"]]
[docs]
def load_smart_home_readings(
    column: SmartHomeColumn | list[SmartHomeColumn] = "use [kW]",
    start_date: str | pd.Timestamp | None = None,
    end_date: str | pd.Timestamp | None = None,
    freq: str | None = None,
) -> pd.DataFrame:
    """Load smart home energy readings from Kaggle.

    Downloads the ``HomeC.csv`` file from the
    `Smart Home Dataset with Weather Information
    <https://www.kaggle.com/datasets/taranvee/smart-home-dataset-with-weather-information>`_
    dataset. Returns data for the requested appliance or total column(s),
    filtered to the given date range and aggregated to the specified
    frequency.

    The raw data has 1-minute resolution and covers roughly
    2016-01-01 to 2016-12-16. Each column is in **kW**.

    Parameters
    ----------
    column : SmartHomeColumn or list[SmartHomeColumn], default "use [kW]"
        The appliance or total column(s) to extract (see
        ``SmartHomeColumn``). When a single string is passed the
        returned DataFrame has columns ``ds`` and ``y``. When a list
        is passed the result is in long format with an additional
        ``series`` column identifying each appliance.

        Common choices:

        - ``"use [kW]"`` — total energy use
        - ``"gen [kW]"`` — total energy generation
        - ``"House overall [kW]"`` — house overall consumption
        - ``"Dishwasher [kW]"``, ``"Fridge [kW]"``, etc. — individual
          appliances
    start_date : str, pd.Timestamp, or None, default None
        Start of the date range (inclusive). If None, the earliest
        available date is used (~2016-01-01).
    end_date : str, pd.Timestamp, or None, default None
        End of the date range (inclusive). A bound without a time of
        day (e.g. ``"2016-06-30"``, i.e. midnight) is treated as the
        whole calendar day, so every reading on that day is kept. A
        bound with an explicit non-midnight time is applied exactly.
        If None, the latest available date is used (~2016-12-16).
    freq : str or None, default None
        Pandas offset alias for temporal aggregation (e.g. ``"D"`` for
        daily mean, ``"h"`` for hourly mean, ``"W"`` for weekly mean).
        The aggregation function is ``mean``. If None, no aggregation
        is performed and the original 1-minute data is returned.

    Returns
    -------
    pd.DataFrame
        DataFrame with columns:

        - ``ds``: datetime
        - ``y``: float, energy reading in kW
        - ``series``: str *(only when ``column`` is a list)* —
          the original column name from the Kaggle dataset

        Rows without a usable ``y`` value are dropped and the index is
        reset.

    Raises
    ------
    ImportError
        If ``kagglehub`` is not installed.

    Examples
    --------
    >>> from vangja.datasets import load_smart_home_readings
    >>> df = load_smart_home_readings("Fridge [kW]", "2016-03-01", "2016-06-30")  # doctest: +SKIP
    >>> print(df.columns.tolist())  # doctest: +SKIP
    ['ds', 'y']

    Multiple columns return a long-format DataFrame:

    >>> df = load_smart_home_readings(
    ...     ["Fridge [kW]", "Microwave [kW]"], freq="D"
    ... )  # doctest: +SKIP
    >>> print(df.columns.tolist())  # doctest: +SKIP
    ['ds', 'y', 'series']

    Notes
    -----
    Requires the ``kagglehub`` package (install with
    ``pip install vangja[datasets]``).

    Data is downloaded and cached locally by ``kagglehub``. A valid
    Kaggle API token is required (see
    `Kaggle API docs <https://github.com/Kaggle/kaggle-api#api-credentials>`_).

    The raw ``time`` column of the CSV is not read. Timestamps are
    synthesised instead: rows are assumed to be consecutive 1-minute
    readings starting at 2016-01-01 05:00.

    Values that cannot be parsed as numbers (e.g. a malformed trailing
    row) become NaN and are removed from the result.

    References
    ----------
    .. [1] Smart Home Dataset with Weather Information.
       https://www.kaggle.com/datasets/taranvee/smart-home-dataset-with-weather-information
    """
    _ensure_kagglehub()
    import kagglehub

    columns: list[str] = [column] if isinstance(column, str) else list(column)

    path = kagglehub.dataset_download(
        "taranvee/smart-home-dataset-with-weather-information",
    )
    csv_path = Path(path) / "HomeC.csv"
    df = pd.read_csv(csv_path, usecols=columns)

    # Fix timestamps: assign one synthetic 1-minute step per CSV row.
    df["ds"] = pd.date_range("2016-01-01 05:00", periods=len(df), freq="min")

    # Coerce all value columns to numeric (unparseable cells become NaN)
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    # Filter the date range while the frame is still wide: one row per
    # timestamp is cheaper to filter than one row per (timestamp, column).
    if start_date is not None:
        df = df[df["ds"] >= pd.Timestamp(start_date)]
    if end_date is not None:
        end = pd.Timestamp(end_date)
        if end == end.normalize():
            # Date-only bound: comparing against midnight would discard
            # every reading after 00:00 on the final day, so keep the
            # whole day instead.
            df = df[df["ds"] < end + pd.Timedelta(days=1)]
        else:
            df = df[df["ds"] <= end]

    # Melt to long format
    df = df.melt(id_vars="ds", value_vars=columns, var_name="series", value_name="y")

    # Aggregate to requested frequency (nothing to aggregate if no rows remain)
    if freq is not None and not df.empty:
        df = (
            df.groupby("series")
            .resample(freq, on="ds")
            .mean(numeric_only=True)
            .reset_index()
        )

    df = df.dropna(subset=["ds", "y"])

    # Single column: return simple ds/y DataFrame (no series column)
    if isinstance(column, str):
        return df[["ds", "y"]].reset_index(drop=True)
    return df[["ds", "y", "series"]].reset_index(drop=True)