Skip to content
Snippets Groups Projects
Verified Commit 60e3495d authored by Marco Aceti's avatar Marco Aceti
Browse files

Add 'none' agg-func and 'weekday' group by

parent d5c0d23e
No related branches found
No related tags found
No related merge requests found
Pipeline #2782 failed
......@@ -3,3 +3,5 @@ python-dateutil==2.8.2
tqdm==4.65.0
geojson==3.0.1
pandas==1.5.3
matplotlib==3.7.1
seaborn==0.12.2
import pandas as pd
from pandas.core.groupby.generic import DataFrameGroupBy
from src.const import LOCALE
def train_number(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the train number (the ``number`` column)."""
    by_number: DataFrameGroupBy = df.groupby(by="number")
    return by_number
def train_hash(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the ``train_hash`` column."""
    by_hash: DataFrameGroupBy = df.groupby(by="train_hash")
    return by_hash
def client_code(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by client code, leaving out the "OTHER" rows."""
    # Rows whose client code is "OTHER" are dropped before grouping.
    is_known_company = df.client_code != "OTHER"
    return df.loc[is_known_company].groupby("client_code")
def weekday(df: pd.DataFrame) -> DataFrameGroupBy:
    """Group the dataframe by the (departure) weekday.

    A ``weekday`` column holding the day name of ``df.day`` (rendered with
    ``LOCALE``) is added to a copy of ``df``; the caller's dataframe is left
    untouched.

    Args:
        df: dataframe whose ``day`` column supports the ``.dt`` accessor.

    Returns:
        The dataframe extended with the ``weekday`` column, grouped by it.
    """
    # ``assign`` returns a new frame: the caller's frame is not mutated and no
    # SettingWithCopyWarning is raised when ``df`` is a filtered slice.
    with_weekday = df.assign(weekday=df.day.dt.day_name(locale=LOCALE))
    return with_weekday.groupby("weekday")
def agg_last(df_grouped: DataFrameGroupBy) -> pd.DataFrame:
......
......@@ -27,7 +27,7 @@ def read_train_csv(file: Path) -> pd.DataFrame:
infer_datetime_format=True,
)
df.client_code = df.client_code.apply(RailwayCompany.from_code) # type: ignore
df.day = df.day.apply(lambda dt: dt.date())
df.day = pd.to_datetime(df.day.apply(lambda dt: dt.date()))
return df.loc[(df.phantom == False) & (df.trenord_phantom == False)].drop(
["phantom", "trenord_phantom"], axis=1
)
......
......@@ -27,7 +27,9 @@ def register_args(parser: argparse.ArgumentParser):
help="group by stops by a value",
choices=(
"none",
"number",
"train_hash",
"client_code",
"weekday",
),
default="none",
)
......@@ -35,6 +37,7 @@ def register_args(parser: argparse.ArgumentParser):
"--agg-func",
help="group by aggregation function",
choices=(
"none",
"mean",
"last",
),
......@@ -43,17 +46,11 @@ def register_args(parser: argparse.ArgumentParser):
parser.add_argument(
"--stat",
help="the stat to calculate",
choices=("describe",),
default="describe",
)
parser.add_argument(
"--format",
help="output format",
choices=(
"human",
"csv",
"describe",
"delay_boxplot",
),
default="human",
default="describe",
)
parser.add_argument(
"station_csv",
......@@ -87,31 +84,35 @@ def main(args: argparse.Namespace):
train_df: pd.DataFrame = read_train_csv(pathlib.Path(train_csv))
df = pd.concat([df, train_df], axis=0)
logging.debug(f"Loaded {len(train_df)} data points @ {path}")
df.reset_index(drop=True, inplace=True)
stations: pd.DataFrame = read_station_csv(args.station_csv)
original_length: int = len(df)
# Apply filters
df = date_filter(df, start_date, end_date)
df: pd.DataFrame | DataFrameGroupBy = date_filter(df, start_date, end_date)
logging.info(f"Loaded {len(df)} data points ({original_length} before filtering)")
if args.group_by != "none":
df_grouped: DataFrameGroupBy | None = None
if args.group_by == "train_number":
df_grouped = groupby.train_number(df)
if args.group_by == "train_hash":
df_grouped = groupby.train_hash(df)
elif args.group_by == "client_code":
df_grouped = groupby.client_code(df)
elif args.group_by == "weekday":
df_grouped = groupby.weekday(df)
assert df_grouped is not None
if args.agg_func == "last":
df = df_grouped.last()
elif args.agg_func == "mean":
df = df_grouped.mean()
df = df_grouped.mean(numeric_only=True)
elif args.agg_func == "none":
df = df_grouped
if args.stat == "describe":
df = stat.describe(df)
if args.format == "human":
print(df)
elif args.format == "csv":
print(df.to_csv())
stat.describe(df)
elif args.stat == "delay_boxplot":
stat.delay_boxplot(df)
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pandas.core.groupby.generic import DataFrameGroupBy
from src.const import WEEKDAYS
def describe(df: pd.DataFrame) -> pd.DataFrame:
def describe(df: pd.DataFrame | DataFrameGroupBy) -> None:
"""Call pandas.DataFrame.describe()"""
return df[
[
print(df.describe())
def delay_boxplot(df: pd.DataFrame | DataFrameGroupBy) -> None:
"""Show a seaborn boxplot of departure and arrival delays"""
sns.set_theme(style="ticks", palette="pastel")
sns.set()
if isinstance(df, DataFrameGroupBy):
grouped_by: str = df.any().index.name
group_melt = pd.DataFrame()
grouped: list = list(df) # type: ignore
if grouped_by == "weekday":
# Re-order fields
grouped.sort(key=lambda t: WEEKDAYS[t[0]])
for group in grouped: # type: ignore
melt = pd.melt(
group[1],
id_vars=[
col
for col in df.obj.columns
if col
not in [
"arrival_delay",
"departure_delay",
"crowding",
]
].describe()
],
value_name="value",
)
group_melt = pd.concat([group_melt, melt])
ax = sns.boxplot(
group_melt[[grouped_by, "variable", "value"]],
x=grouped_by,
y="value",
hue="variable",
showfliers=False,
)
ax.set(xlabel=grouped_by, ylabel="Delay (minutes)")
elif isinstance(df, pd.DataFrame):
ax = sns.boxplot(
df[["arrival_delay", "departure_delay"]],
showfliers=False,
)
ax.set(xlabel="Variable", ylabel="Delay (minutes)")
plt.grid()
plt.show()
import locale
from enum import Enum
from dateutil import tz
......@@ -8,6 +9,27 @@ TIMEZONE = tz.gettz("GMT+1")
# Intra-day split hour
INTRADAY_SPLIT_HOUR: int = 4
# Locale name used for pandas and applied process-wide by the setlocale()
# call below.
LOCALE: str = "it_IT.utf-8"
# Weekdays
# Apply LOCALE to every locale category so that the nl_langinfo() lookups
# that follow return day names in that locale.
# NOTE(review): setlocale() raises locale.Error when the locale is not
# available on the host — confirm it is installed wherever this is imported.
locale.setlocale(locale.LC_ALL, LOCALE)
def _w(weekday_number: int) -> str:
    """Return the title-cased day name for ``locale.DAY_<weekday_number>``.

    The name is looked up with ``locale.nl_langinfo``, so it follows the
    locale in effect when the function is called.
    """
    langinfo_key = getattr(locale, f"DAY_{weekday_number}")
    day_name: str = locale.nl_langinfo(langinfo_key)
    return day_name.title()


# Maps each localized day name to its position in a Monday-first week
# (Monday=1 ... Sunday=7). The langinfo constants count from Sunday (DAY_1),
# hence the rotated order of the lookup keys.
WEEKDAYS = {
    _w(langinfo_day): position
    for position, langinfo_day in enumerate((2, 3, 4, 5, 6, 7, 1), start=1)
}
class RailwayCompany(Enum):
"""Italian railway companies codes."""
......@@ -21,8 +43,9 @@ class RailwayCompany(Enum):
OTHER = -1
@classmethod
def from_code(cls, code: int) -> "RailwayCompany":
def from_code(cls, code: int) -> str:
try:
return cls(code)
instance: "RailwayCompany" = cls(code)
except ValueError:
return cls.OTHER
instance: "RailwayCompany" = cls.OTHER
return instance.name
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment