Data Overview¶

Cargamos algunas de las librerías necesarias para nuestro análisis.

In [1]:
from __future__ import annotations

from pathlib import Path

import gc

import os
import base64
import hmac
import hashlib
from typing import Dict, Optional

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.transforms as transforms
import movingpandas as mpd
import numpy as np
import hvplot.pandas 

from pyproj import Geod

#mpd.show_versions() 
In [2]:
# Go to workspace root
def find_repo_root(start: Optional[Path] = None) -> Path:
    """Locate the workspace root by walking upward from *start*.

    Parameters
    ----------
    start : Optional[Path]
        Directory to start from; defaults to the current working directory.

    Returns
    -------
    Path
        The first ancestor (including *start* itself) that contains a known
        repository marker; the resolved start path if no marker is found.
    """
    base = (start or Path.cwd()).resolve()
    marker_names = (".git", "pyproject.toml", "requirements.txt", "setup.cfg")

    for candidate in [base, *base.parents]:
        has_marker = any((candidate / name).exists() for name in marker_names)
        if has_marker:
            return candidate
    return base

Ingestion¶

Ingestamos los archivos que tenemos disponibles

In [3]:
# Configure Ingestion

ROOT = find_repo_root()
DATA_DIR = "data/raw"
DATA_PATH = ROOT / DATA_DIR
# NOTE(review): raises IndexError if data/raw contains no CSVs — confirm that is acceptable.
CSV_PATH = sorted(DATA_PATH.glob("*.csv"))[0] #using any of our files

# CSV Labels — column names as they appear in the telemetry export.
ID_COL = "nombre"
TIME_COL = "fecha"
LAT_COL = "lat"
LON_COL = "lon"
SPEED_COL = "vel"
In [4]:
# Read CSV file (roughly 14M rows for one day of telemetry — this can take a while).
df = pd.read_csv(CSV_PATH)
In [5]:
# Raw volume sanity check before any cleaning.
print(f"Raw data points: {len(df)} in {CSV_PATH}")
print(f"Raw vehicles: {len(df[ID_COL].unique())}")
Raw data points: 14138871 in C:\fabro\Projects\eda-notebook-workspace\data\raw\datos_telematica_2025-09-02.csv
Raw vehicles: 20673

Ofuscación¶

Como nuestro Notebook estará publicado en un repositorio público, vamos a ofuscar las identidades. Generamos un Hash a partir de los dominios de los vehículos y una clave privada. Recortamos el token final a 12 caracteres (suficiente para el parque de vehículos con el que venimos trabajando, ~20k).

In [6]:
# Needed functions

# Load KEY=VALUE
def load_kv_file(path: Path) -> Dict[str, str]:
    """Parse a simple KEY=VALUE file into a dict.

    Blank lines and '#' comments are skipped, as is any line without '='.
    Surrounding single or double quotes are stripped from values.
    Returns an empty dict when the file does not exist.
    """
    if not path.exists():
        return {}

    parsed: Dict[str, str] = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        parsed[key.strip()] = value.strip().strip("'").strip('"')
    return parsed

# Get Key from file
def get_hmac_key(
    env_var: str = "PLATE_HMAC_KEY_B64",
    secrets_relpath: str = "config/secrets.env",
) -> bytes:
    """Load and validate the base64-encoded HMAC key from the secrets file.

    Parameters
    ----------
    env_var : str
        Key name to look up inside the secrets file.
    secrets_relpath : str
        Path of the secrets file, relative to the repository root.

    Returns
    -------
    bytes
        The decoded key (at least 16 bytes).

    Raises
    ------
    RuntimeError
        If the variable is missing from the secrets file.
    ValueError
        If the value is not valid base64 or the key is too short.
    """
    root = find_repo_root()
    secrets_path = root / secrets_relpath
    kv = load_kv_file(secrets_path)
    b64 = kv.get(env_var)

    if not b64:
        raise RuntimeError(
            f"No encontré {env_var}. Seteala en {secrets_relpath}."
        )

    try:
        # validate=True rejects non-base64 characters instead of ignoring them.
        key = base64.b64decode(b64, validate=True)
    except Exception as e:
        # Chain the original exception so the decode failure stays debuggable.
        raise ValueError(f"{env_var} no es base64 válido: {e}") from e

    if len(key) < 16:
        raise ValueError(f"{env_var} demasiado corta ({len(key)} bytes). Usá >= 16, ideal 32.")

    return key

# HMAC -> Base62 -> token corto
_ALPH = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

def _b62_from_int(x: int) -> str:
    if x == 0:
        return _ALPH[0]
    s = []
    while x:
        x, r = divmod(x, 62)
        s.append(_ALPH[r])
    return "".join(reversed(s))

def _bytes_needed_for_base62_len(out_len: int) -> int:
    # bits ~= out_len * log2(62) ; log2(62) ≈ 5.954
    bits = int((out_len * 5.954242509439324) + 7)  # +7 para ceil/8
    return max(8, (bits + 7) // 8)  # mínimo 8 bytes (64b) por robustez

def plate_token_norm(norm_plate: str, key: bytes, out_len: int = 12) -> str:
    """HMAC-SHA256 a normalized plate string and return a base62 token of
    exactly ``out_len`` characters (left-padded with '0', then truncated)."""
    message = (norm_plate or "").encode("utf-8")
    mac = hmac.new(key, message, hashlib.sha256).digest()

    take = _bytes_needed_for_base62_len(out_len)
    value = int.from_bytes(mac[:take], "big")

    encoded = _b62_from_int(value).rjust(out_len, _ALPH[0])
    return encoded[:out_len]

def obfuscate_plate_column(df: pd.DataFrame, col: str, key: bytes, out_len: int = 12, out_col: str | None = None):
    """Replace plate strings in ``col`` with deterministic HMAC-derived tokens.

    Plates are normalized (trimmed, upper-cased, spaces and dashes removed)
    before hashing; each distinct plate is hashed exactly once; original NaNs
    are preserved. Mutates ``df`` in place and returns it.
    """
    target = out_col or col

    plates = df[col].astype("string")
    normalized = (
        plates.fillna("")
              .str.strip()
              .str.upper()
              .str.replace(" ", "", regex=False)
              .str.replace("-", "", regex=False)
    )

    # Factorize so the (expensive) HMAC runs once per unique plate only.
    codes, uniques = pd.factorize(normalized, sort=False)

    unique_tokens = np.array(
        [plate_token_norm(value, key=key, out_len=out_len) for value in uniques],
        dtype=object,
    )

    # Broadcast the per-unique tokens back to every row.
    df[target] = unique_tokens[codes]

    # Keep originally-missing plates missing.
    df.loc[plates.isna(), target] = pd.NA

    return df
In [7]:
# Obfuscate the vehicle ID column in place using the secret HMAC key.
key = get_hmac_key()
df = obfuscate_plate_column(df, col=ID_COL, key=key, out_len=12)

# Remove the secret from the kernel namespace as soon as it is no longer needed.
del key
gc.collect()
Out[7]:
72

Data Cleaning and Filtering¶

In [8]:
# Quick look at the schema and the (now obfuscated) ID column.
df.head()
Out[8]:
nombre fecha idtipo idtipoexterno segundosdemorarecepcion lat lon vel rumbo
0 nzb6PvZHUPEZ 2025-09-02 02:46:41+00 0 1 4 NaN NaN NaN NaN
1 XuzAaa26JPjR 2025-09-02 02:46:42+00 0 1 3 -32.85436 -61.34860 82.0 266.0
2 OL7Nf8ksDelc 2025-09-02 02:46:41+00 0 1 4 -34.61745 -60.40863 0.0 NaN
3 zT7zfozMrWsX 2025-09-02 02:46:40+00 0 1006 5 -33.53368 -60.10854 82.0 319.0
4 TtPO38583MG3 2025-09-02 02:46:41+00 0 1 4 -34.31298 -60.23689 0.0 NaN

Basic Clean¶

Realizamos limpieza básica de los datos.

In [9]:
# Drop rows missing any of the columns required downstream.
required_cols = [ID_COL, TIME_COL, LAT_COL, LON_COL, SPEED_COL]
df_clean = df.dropna(subset=required_cols).copy()
print(f"Removed {len(df) - len(df_clean)} rows with missing values in {required_cols}")
Removed 82908 rows with missing values in ['nombre', 'fecha', 'lat', 'lon', 'vel']
In [10]:
# Drop trajectories with fewer than two points
# (a single fix cannot form a trajectory segment — presumably required by movingpandas; confirm).
vehicles = df_clean[ID_COL].unique()
traj_sizes = df_clean.groupby(ID_COL).size()
valid_ids = traj_sizes[traj_sizes >= 2].index
df_clean = df_clean[df_clean[ID_COL].isin(valid_ids)]
print(f"Removed {len(vehicles) - len(valid_ids)} vehicles with <2 registries")
Removed 62 vehicles with <2 registries

Discard Home Ranged Vehicles¶

Descartamos vehículos que consideramos que están en un área operativa reducida o, posiblemente para el momento del análisis, dentro del home range si consideramos que todos realizan viajes de larga distancia.

In [11]:
# Define radius in km of the Home Range area
HOME_RANGE_THRESHOLD = 50

# Sort data by ID and Time so first/last aggregations are chronological.
df_clean = df_clean.sort_values([ID_COL, TIME_COL])

# First and last known position per vehicle.
ends = df_clean.groupby(ID_COL).agg(
    lon_start=(LON_COL, "first"),
    lat_start=(LAT_COL, "first"),
    lon_end=(LON_COL, "last"),
    lat_end=(LAT_COL, "last"),
)

# Geodesic start-to-end distance on the WGS84 ellipsoid.
# Geod.inv returns (forward azimuth, back azimuth, distance in meters).
# NOTE(review): start-to-end distance underestimates range for round trips —
# a vehicle that loops back would be classified as "home ranged".
geod = Geod(ellps="WGS84")
_, _, dist_m = geod.inv(
    ends["lon_start"].values,
    ends["lat_start"].values,
    ends["lon_end"].values,
    ends["lat_end"].values,
)
ends["dist_km"] = dist_m / 1000.0

# Keep only vehicles that ended farther than the threshold from where they started.
ids_to_drop = ends.loc[ends["dist_km"] < HOME_RANGE_THRESHOLD].index
df_filtered = df_clean.loc[~df_clean[ID_COL].isin(ids_to_drop)].copy()

print(f"Dropped {len(ids_to_drop)} vehicles; remaining: {len(df_filtered[ID_COL].unique())}")
Dropped 13184 vehicles; remaining: 7373

Sampling¶

Tomamos una muestra aleatoria de los datos, utilizando el dominio del vehículo para filtrar.

In [12]:
# Fraction of vehicles to keep in the working sample.
PERCENT=0.10

# Sample vehicle IDs (fixed seed for reproducibility) and keep all of their rows.
vehicles = df_filtered[ID_COL].unique()
sampled_ids = pd.Series(vehicles).sample(frac=PERCENT, random_state=42)
df_sampled = df_filtered[df_filtered[ID_COL].isin(sampled_ids)].copy()

print(f"Total vehicles: {len(vehicles)} | Sampled: {len(sampled_ids)}")
Total vehicles: 7373 | Sampled: 737

Pre-Processing¶

En esta sección mejoramos/preparamos los datos para el análisis que vamos a realizar luego. No filtramos ni limpiamos, sino que sacamos elementos que no serán necesarios para nuestro análisis.

In [13]:
# Parse timestamps as UTC, then drop tz info: downstream tools get naive UTC datetimes.
df_sampled[TIME_COL] = pd.to_datetime(df_sampled[TIME_COL], utc=True).dt.tz_localize(None)
In [14]:
# Trim zero-velocity runs at the start/end of each vehicle-day.
df_sampled = df_sampled.sort_values([ID_COL, TIME_COL])
df_sampled["date"] = df_sampled[TIME_COL].dt.date

# Group by vehicle and calendar day; every other column is treated as a value column.
group_cols = [ID_COL, "date"]
value_cols = df_sampled.columns.difference(group_cols)

def trim_zero_edges(g):
    """Keep only the span between the first and last non-zero-speed rows of a group."""
    moving_idx = g.index[g[SPEED_COL].fillna(0) != 0]
    if moving_idx.empty:
        # No movement recorded that day -> drop every row of the group.
        return g.iloc[0:0]
    return g.loc[moving_idx[0]:moving_idx[-1]]

# Apply the trimming per (vehicle, day) and restore the group keys as columns.
df_trimmed = (
    df_sampled.groupby(group_cols, group_keys=True)[value_cols]
      .apply(trim_zero_edges)
      .reset_index(level=[0, 1])   # bring group keys back as columns
      .reset_index(drop=True)
)

print(f"Trimmed {len(df_sampled) - len(df_trimmed)} rows")
Trimmed 65729 rows
In [15]:
# Convert to GeoPandas; project from WGS84 (EPSG:4326) to EPSG:3857.
# NOTE(review): EPSG:3857 is Web Mercator, not UTM — distances in this CRS are
# increasingly distorted away from the equator; confirm this is acceptable here.
gdf = gpd.GeoDataFrame(
    df_trimmed,
    geometry=gpd.points_from_xy(df_trimmed[LON_COL], df_trimmed[LAT_COL], crs="EPSG:4326").to_crs("EPSG:3857"),
)

Inspect¶

Código auxiliar para inspeccionar datos.

Trajectory¶

In [16]:
# Obfuscated token of the vehicle to inspect.
DOMINIO="3UEGwOiggnWt"

# Pull only the columns needed to build a trajectory for this vehicle.
g = gdf.loc[gdf[ID_COL] == DOMINIO, [ID_COL, TIME_COL, "geometry"]].copy()
if g.empty:
    raise ValueError(f"No se encontraron registros para ID_COL={DOMINIO}")

# movingpandas expects a time-sorted DatetimeIndex.
g = g.sort_values(TIME_COL).set_index(TIME_COL)

traj = mpd.Trajectory(g, traj_id=DOMINIO)

# Back to lon/lat for the web-tile basemap.
traj_gdf = traj.to_traj_gdf().to_crs(epsg=4326)

traj_gdf.hvplot(
    geo=True,
    tiles="CartoLight",
    rasterize=True,      # similar to datashade, hover on pixels
    line_width=1,
    alpha=0.9,
    #hover=False,         # skip tooltips for speed
    width=900,
    height=600,
)
Out[16]:

Analysis¶

Esta sección contiene cada una de las lineas de análisis de nuestros datos.

Sampling Quality¶

¿Qué busco con Sampling Quality? Analizar, describir y desarrollar métricas orientadas a eliminar el ruido de los datos que voy a usar para futuros modelos. No evaluamos calidad de la observación generada (GPS o cinemómetro) sino cantidad, frecuencia temporal y/o espacial, y otros atributos que podamos derivar.

Run Statistics¶

In [17]:
# Build trajectories
tc = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)

# Per-trajectory gap and speed stats (meters, m/s)
records = []
for traj in tc:
    # Prefer an explicit "timestamp" column if present; otherwise assume the
    # timestamps live in the index — TODO confirm which layout movingpandas
    # produces for this version.
    time_series = (
        traj.df["timestamp"]
        if "timestamp" in traj.df.columns
        else traj.df.index.to_series()
    )
    time_series = pd.to_datetime(time_series)

    # Time gap between consecutive fixes, in seconds (first value is NaN).
    gaps = time_series.diff().dt.total_seconds()
    traj.add_distance()
    traj.add_speed()

    n_points = len(traj.df)
    # NOTE(review): coverage_s can be 0 if all fixes share one timestamp —
    # confirm the >=2-points filter upstream makes this impossible.
    coverage_s = (time_series.max() - time_series.min()).total_seconds()

    # Main measures — medians and 95th percentiles per trajectory.

    records.append({
        "id": traj.id,
        "n_points": n_points,
        "coverage_s" : coverage_s,
        "median_gap_s": gaps.median(),
        "p95_gap_s": gaps.quantile(0.95),
        "median_dist_m": traj.df["distance"].median(),
        "p95_dist_m": traj.df["distance"].quantile(0.95),
        "median_speed_mps": traj.df["speed"].median(),
        "p95_speed_mps": traj.df["speed"].quantile(0.95),
        "pct_speed_over_50kph": (traj.df["speed"] > 50/3.6).mean(),
    })

# Build the stats frame once from the record list (avoids quadratic concat).
stats = pd.DataFrame(records)
In [18]:
# Basic normalizations by time.
stats["coverage_hours"] = stats["coverage_s"] / 3600.0
stats["pts_per_hour"] = stats["n_points"] / stats["coverage_hours"]

# Expected points and Sampling Efficiency
stats["expected_points_median_gap"] = stats["coverage_s"] / stats["median_gap_s"]
stats["sampling_efficiency"] = stats["n_points"] / stats["expected_points_median_gap"]  # ~1 if consistent

# Consistency between distance/gap vs reported speed.
stats["median_speed_kph"] = stats["median_speed_mps"] * 3.6
stats["p95_speed_kph"]    = stats["p95_speed_mps"] * 3.6
# Implied speed: median step distance over median time gap (converted to kph).
stats["implied_median_speed_kph"] = (stats["median_dist_m"] / (stats["median_gap_s"])) * 3.6
stats["implied_p95_speed_kph"]    = (stats["p95_dist_m"]    / (stats["p95_gap_s"])) * 3.6
stats["delta_median_speed_kph"] = stats["implied_median_speed_kph"] - stats["median_speed_kph"]
stats["delta_p95_speed_kph"]    = stats["implied_p95_speed_kph"]    - stats["p95_speed_kph"]

# Tail / median sampling-gap ratio: >1 means the gap distribution has a heavy tail.
stats["gap_tail_ratio"] = stats["p95_gap_s"] / stats["median_gap_s"]

Plotting + Description¶

Ejecutamos algunas gráficas para visualizar la dispersión de los datos en base a algunas medidas propuestas.

In [19]:
# Left Plot: coverage vs number of points.
x = stats["coverage_hours"].to_numpy()
y = stats["n_points"].to_numpy()

# Run a linear regression (fit in linear space; the axis below is log-scaled).
a, b = np.polyfit(x, y, 1)

# Fit data to graph our function
x_fit = np.linspace(x.min(), x.max(), 200)
y_fit = a * x_fit + b

# R^2 of the linear fit (computed in linear space).
y_pred = a * x + b
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2 = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.scatter(x, y, c="cadetblue",alpha=0.7)
ax1.plot(x_fit, y_fit, color="coral", linewidth=2)

ax1.set_yscale("log")
ax1.set_xlabel("coverage_hours")
ax1.set_ylabel("n_points (log)")
ax1.set_title("Coverage vs Num. of Points")
ax1.grid(True, alpha=0.3)

# NOTE(review): the fit is linear (y = a·x + b) and only displayed on a log
# axis; the label wording suggests a fit of log(y) — confirm intended wording.
eq = f"log(y) = log({a:.4f}·x + {b:.4f})\nR² = {r2:.4f}"
ax1.text(
    0.025, 0.97, eq,
    transform=ax1.transAxes,
    va="top", ha="left",
    color="crimson",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)

# Right Plot: sampling gap vs point density, with mean guide lines.
mean_x = stats["median_gap_s"].mean()
mean_y = stats["pts_per_hour"].mean()
# Blended transforms let labels track one data coordinate and one axes coordinate.
tx = transforms.blended_transform_factory(ax2.transData, ax2.transAxes)  # x=data, y=axes
ty = transforms.blended_transform_factory(ax2.transAxes, ax2.transData)  # x=axes, y=data
ax2.scatter(stats["median_gap_s"], stats["pts_per_hour"], alpha=.7)
ax2.set_xscale("log"); ax2.set_yscale("log")
ax2.set_xlabel("median_gap_s (log)")
ax2.set_ylabel("pts_per_hour (log)")
ax2.axvline(mean_x, color="coral", linestyle=":", linewidth=2)
ax2.axhline(mean_y, color="coral", linestyle=":", linewidth=2)
ax2.set_title("Sampling Gap vs Point Density")
ax2.grid(True, alpha=.3)

ax2.text(
    mean_x, 0.98, f"μx = {mean_x:.2f}",
    transform=tx, rotation=90,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)

ax2.text(
    0.98, mean_y, f"μy = {mean_y:.2f}",
    transform=ty,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)

plt.tight_layout()
plt.show()
No description has been provided for this image

Coverage vs Num. of Points:

  • Cobertura baja + pocos puntos => “vehículo con poca actividad / ventana corta”.
  • Cobertura alta + pocos puntos => mal muestreo.
  • Regresión y R^2 = Baja relación lineal entre cobertura y número de puntos.
    • ¿Una flota uniforme debería ajustarse mejor?

Sampling Gap vs Point Density:

  • ¿Qué es lo que buscamos? Outliers con median_gap bajo pero pts_per_hour bajo.
  • Puede implicar gap mal estimado o timestamps raros.
In [20]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left Plot: distribution of sampling efficiency.
x = stats["sampling_efficiency"]

# Trim tails (1% - 99%) so extreme outliers do not stretch the histogram.
lo, hi = x.quantile([0.01, 0.99])
x_plot = x.clip(lo, hi)

ax1.hist(x_plot, bins=50, color="darkseagreen", alpha=0.8, rwidth=0.92)

# Reference lines: ideal value (1.0), median, and P95 of the raw series.
ax1.axvline(1.0, linewidth=1, color="crimson")
ax1.axvline(x.median(), linestyle="--", linewidth=2)
ax1.axvline(x.quantile(0.95), linestyle=":", linewidth=2)

ax1.set_xlabel("sampling_efficiency")
ax1.set_title("Sampling Efficiency")
ax1.grid(True, alpha=.25)
ax1.set_axisbelow(True)

ax1.text(
    0.02, 0.97,
    f"median={x.median():.2f}\nP95={x.quantile(0.95):.2f}\nP99={x.quantile(0.99):.2f}",
    transform=ax1.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)

# Right Plot: distribution of the gap tail ratio, on a log10 scale.
g = stats["gap_tail_ratio"]
g_log = np.log10(g)

# Trim tails (1% - 99%)
lo, hi = g_log.quantile([0.01, 0.99])
g_plot = g_log.clip(lo, hi)

ax2.hist(g_plot, bins=40, color="mediumseagreen", alpha=0.8, rwidth=0.92)

# Reference lines: ideal value (log10(1) = 0), median, and P95.
ax2.axvline(0.0, linewidth=1, color="crimson")
ax2.axvline(g_log.median(), linestyle="--", linewidth=2)
ax2.axvline(g_log.quantile(0.95), linestyle=":", linewidth=2)

ax2.set_xlabel("log10(p95_gap / median_gap)")
ax2.set_title("Sample Irregularity (Tail ratio)")
ax2.grid(True, alpha=.25)
ax2.set_axisbelow(True)

ax2.text(
    0.83, 0.97,
    f"median={g_log.median():.2f}\nP95={g_log.quantile(0.95):.2f}\nP99={g_log.quantile(0.99):.2f}",
    transform=ax2.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)

plt.tight_layout()
plt.show()
No description has been provided for this image

Sampling Efficiency (Ideal 1)

  • 1: el número de puntos esperados (tiempo/mediana de gap) es el número de puntos que tenemos.
  • <1: faltan puntos respecto a lo que sugiere el gap.
  • >1: duplicados, bursts, o median_gap subestima.

Sample Irregularity (Tail ratio) (Ideal 0)

  • Percentil 95 de gaps / Mediana de gap
  • Ratio alto = Corte de la regularidad (¿faltan muestras?)

Outliers acá suelen ser saltos GPS o timestamps con problemas.

In [21]:
# Columns used for the consistency plots.
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
# Use a dedicated name instead of clobbering `df` (the raw dataframe loaded above).
plot_df = stats[cols]

# Color setup: log-scale the tail ratio and clip to the 1%-99% range so
# extreme outliers do not wash out the colormap.
c = np.log10(plot_df["gap_tail_ratio"])
c_lo, c_hi = np.nanquantile(c, [0.01, 0.99])
c_plot = np.clip(c, c_lo, c_hi)

# Identity scatter limits, shared by both axes so the diagonal sits at 45°.
lim = np.nanmax([plot_df["median_speed_kph"].max(), plot_df["implied_median_speed_kph"].max()])
lims = (0, lim)

# Tolerance bands around the identity line.
tol1 = 0.10  # ±10%
tol2 = 0.25  # ±25%

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# =========================
# (1) Consistency
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)

# ±10% Zone
y_low1  = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")

# ±25% Zone
y_low2  = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1,  color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)

# Identity line (reported == implied)
ax1.plot(x_band, x_band, color="0.6", linewidth=1)

sc1 = ax1.scatter(plot_df["median_speed_kph"], plot_df["implied_median_speed_kph"], c=c_plot, alpha=0.6, s=30)

ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.legend(loc="upper left", framealpha=0.9)


# =========================
# (2) Residuals
# =========================
x_band2 = np.linspace(plot_df["median_speed_kph"].min(), plot_df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2,  tol1*x_band2,  color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2,  tol1*x_band2,  tol2*x_band2, color="0.92", alpha=0.9)

ax2.axhline(0, color="0.6", linewidth=1)

sc2 = ax2.scatter(plot_df["median_speed_kph"], plot_df["delta_median_speed_kph"], c=c_plot, alpha=0.6, s=30)

ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.legend(loc="lower right", framealpha=0.9)

# One shared colorbar on the right panel; both panels use the same color scale.
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("log10(gap_tail_ratio)")

fig.suptitle(
    "Speed Analysis (Color by gap_tail_ratio)",
    fontsize=14
)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [22]:
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
# NOTE(review): this rebinds `df` (the raw dataframe) to a stats subset —
# rename to avoid hidden-state surprises on partial re-runs.
df = stats[cols]

# Color setup: diverging colormap centered on the fleet-mean efficiency,
# clipped to the 5%-99% range so outliers do not dominate the scale.
eff = df["sampling_efficiency"].to_numpy()

vmin = np.quantile(eff, 0.05)  
vmax = np.quantile(eff, 0.99)
vcenter = np.mean(eff)
vcenter = np.clip(vcenter, vmin, vmax)
norm = mcolors.TwoSlopeNorm(vmin=vmin, vcenter=vcenter, vmax=vmax)
eff_clip = np.clip(eff, vmin, vmax)

# Tolerance bands around the identity line.
tol1 = 0.10  # ±10% 
tol2 = 0.25  # ±25% 

# Identity scatter limits, shared by both axes.
lim = np.nanmax([df["median_speed_kph"].max(), df["implied_median_speed_kph"].max()])
lims = (0, lim)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# =========================
# (1) Consistency
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)

# ±10% Zone
y_low1  = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")

# ±25% Zone
y_low2  = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1,  color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)

# Identity line (reported == implied)
ax1.plot(x_band, x_band, color="0.6", linewidth=1)

sc1 = ax1.scatter(
    df["median_speed_kph"], df["implied_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)

ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.set_axisbelow(True)
ax1.legend(loc="lower right", framealpha=0.9)

# =========================
# (2) Residuals
# =========================
x_band2 = np.linspace(df["median_speed_kph"].min(), df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2,  tol1*x_band2,  color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2,  tol1*x_band2,  tol2*x_band2, color="0.92", alpha=0.9)

ax2.axhline(0, color="0.6", linewidth=1)

sc2 = ax2.scatter(
    df["median_speed_kph"], df["delta_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)

ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.set_axisbelow(True)
ax2.legend(loc="upper left", framealpha=0.9)

# One shared colorbar; ticks at the clip limits and the center value.
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("sampling_efficiency")
cb2.set_ticks([vmin, vcenter, vmax])

fig.suptitle(
    "Speed Analysis (Color by sampling_efficiency)",
    fontsize=14
)

plt.tight_layout()
plt.show()
No description has been provided for this image

Algunas cosas que observamos:

  • Concentración cerca de la diagonal (zona ±10%), especialmente en el rango medio–alto de velocidades.
  • La velocidad implícita suele subestimar la reportada, y el efecto se acentúa a velocidades mayores.
  • El error crece con la velocidad.
  • gap_tail_ratio se alinea bien con los problemas de consistencia (la velocidad reportada es menor que la implicita)
  • sampling_efficiency evidencia que hay menos puntos que los esperables en todo el rango de velocidades.

Algunas cosas para investigar a futuro:

  • Segmentación de recorridos.
  • Detección de paradas.
  • Generalizaciones para zonas sin datos.
In [23]:
cols = ["id","median_speed_kph","implied_median_speed_kph","gap_tail_ratio","sampling_efficiency","n_points","coverage_hours","delta_median_speed_kph"]
# Work on a dedicated name instead of clobbering `df` (the raw dataframe).
metrics = stats[cols]

# Definition "out of band":
# relative errors explode at low speeds, so enforce a minimum speed threshold.
MIN_SPEED = 10
df_eval = metrics[metrics["median_speed_kph"] >= MIN_SPEED].copy()
df_eval["abs_delta_kph"] = df_eval["delta_median_speed_kph"].abs()
# clip(lower=1e-6) guards the division against a zero denominator.
df_eval["abs_rel_err"] = df_eval["abs_delta_kph"] / df_eval["median_speed_kph"].clip(lower=1e-6)

# Tolerance thresholds, as fractions of the reported speed.
tol10, tol25 = 0.10, 0.25
df_eval["out_10pct"] = df_eval["abs_delta_kph"] > (tol10 * df_eval["median_speed_kph"])
df_eval["out_25pct"] = df_eval["abs_delta_kph"] > (tol25 * df_eval["median_speed_kph"])

# Speed bins, right-open: [10, 20), [20, 40), ...
bins = [MIN_SPEED, 20, 40, 60, 80, 100, 120, 160, np.inf]
labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
df_eval["speed_bin"] = pd.cut(df_eval["median_speed_kph"], bins=bins, labels=labels, right=False)

# Per-bin summary: out-of-band rates plus robust dispersion measures.
summary = (
    df_eval.groupby("speed_bin", observed=False)
    .agg(
        n=("id","count"),
        out_10pct_rate=("out_10pct", "mean"),
        out_25pct_rate=("out_25pct", "mean"),
        median_abs_delta=("abs_delta_kph","median"),
        p95_abs_delta=("abs_delta_kph", lambda s: s.quantile(0.95)),
        median_gap_tail_ratio=("gap_tail_ratio","median"),
        median_sampling_eff=("sampling_efficiency","median"),
    )
    .reset_index()
)

# Round rates for display only.
summary["out_10pct_rate"] = (summary["out_10pct_rate"]).round(4)
summary["out_25pct_rate"] = (summary["out_25pct_rate"]).round(4)

summary
Out[23]:
speed_bin n out_10pct_rate out_25pct_rate median_abs_delta p95_abs_delta median_gap_tail_ratio median_sampling_eff
0 10-20 66 0.9091 0.8636 9.917205 14.965177 4.730206 0.641597
1 20-40 112 0.8929 0.8214 15.579568 27.303946 4.078311 0.576172
2 40-60 80 0.8000 0.6000 16.188403 35.582364 3.457812 0.554763
3 60-80 103 0.6699 0.5049 17.111441 53.443365 2.542373 0.667343
4 80-100 202 0.3465 0.2277 2.701474 59.466536 1.896023 0.729820
5 100-120 48 0.3125 0.2292 1.865364 60.945672 1.355311 0.699619
6 120-160 5 0.6000 0.2000 12.268467 33.681922 1.016667 0.721329
7 160-inf 0 NaN NaN NaN NaN NaN NaN

Scoring¶

Seleccionamos Top N vehículos en función de una métrica de interés para empezar a entender nuestros outliers.

In [24]:
# Metric used to rank vehicles for manual inspection.
SCORING_BY = "abs_rel_err"

# Top-20 vehicles by the chosen score, worst first.
top = (
    df_eval.assign(score=df_eval[SCORING_BY])
    .sort_values("score", ascending=False)
    .head(20)
    .copy()
)

cols_show = [
    "id",
    "median_speed_kph",
    "implied_median_speed_kph",
    "delta_median_speed_kph",
    "abs_delta_kph",
    "abs_rel_err",
    "gap_tail_ratio",
    "sampling_efficiency",
    "n_points",
    "coverage_hours",
]

# Round for display only; the underlying `top` frame keeps full precision.
top[cols_show].assign(
    abs_rel_err=lambda d: (d["abs_rel_err"]).round(3),  # rate error
    delta_median_speed_kph=lambda d: d["delta_median_speed_kph"].round(1),
    abs_delta_kph=lambda d: d["abs_delta_kph"].round(1),
    median_speed_kph=lambda d: d["median_speed_kph"].round(1),
    implied_median_speed_kph=lambda d: d["implied_median_speed_kph"].round(1),
    gap_tail_ratio=lambda d: d["gap_tail_ratio"].round(2),
    sampling_efficiency=lambda d: d["sampling_efficiency"].round(2),
    coverage_hours=lambda d: d["coverage_hours"].round(2),
).rename(columns={"abs_rel_err":"abs_rel_err_rate"})
Out[24]:
id median_speed_kph implied_median_speed_kph delta_median_speed_kph abs_delta_kph abs_rel_err_rate gap_tail_ratio sampling_efficiency n_points coverage_hours
333 3UEGwOiggnWt 20.6 43.0 22.4 22.4 1.088 34.00 0.05 3652 20.39
666 Y1qBOymx8YGb 28.9 59.3 30.4 30.4 1.052 30.00 0.06 2143 10.67
425 4MB0oH5pe80M 23.7 1.0 -22.7 22.7 0.957 6.58 0.56 164 14.62
591 5oPliRq1UI1v 12.6 0.6 -12.0 12.0 0.952 2.29 1.07 783 21.28
188 2S4buidKVaeo 10.4 0.5 -9.9 9.9 0.950 2.52 1.05 501 15.98
199 2UxsOtbEaYvc 10.4 0.5 -9.8 9.8 0.947 2.67 1.01 434 13.55
198 2UNu3nAvubkL 20.0 1.1 -18.9 18.9 0.944 3.22 0.90 395 11.40
576 5edKs4Tl9QiP 13.4 0.8 -12.5 12.5 0.938 2.66 0.99 372 11.79
95 1lGQ1tCRHXrn 11.0 0.7 -10.3 10.3 0.933 5.01 0.56 261 23.42
46 1Kyg7cHK3Mpi 12.5 0.9 -11.6 11.6 0.930 5.36 0.66 294 20.90
73 1YoE4zWDpLth 20.8 1.5 -19.4 19.4 0.929 2.77 1.14 805 21.23
91 1kENfr6FX4UD 32.4 2.7 -29.6 29.6 0.915 2.96 0.76 1017 14.39
109 1qBbPK8P73tB 23.4 2.0 -21.4 21.4 0.913 4.97 0.68 287 21.14
343 3ZZrHcZzX2eI 19.5 1.9 -17.7 17.7 0.904 2.34 0.93 624 23.97
495 4zQiEZ6eO8jr 10.5 1.0 -9.4 9.4 0.902 4.95 0.65 799 20.95
391 43m3itVV18e6 26.7 2.6 -24.1 24.1 0.901 1.02 1.52 655 14.19
471 4kiueP9NP0EV 17.0 1.7 -15.3 15.3 0.901 1.07 1.15 1457 21.12
724 xkvxf4FjNsv9 14.2 1.4 -12.7 12.7 0.900 4.56 0.65 542 15.38
631 GZcaYHMAWfYq 35.5 3.6 -31.9 31.9 0.898 2.70 0.93 676 17.95
256 2wOOQjytBs5k 60.7 6.3 -54.4 54.4 0.896 1.22 1.34 2035 21.17

Proponemos otra estrategia de detección de outliers mediante Outlier Scoring. Para ello vamos a utilizar una ponderación de las siguientes medidas:

  • Sampling Score

    • Promedio de tres señales robustas.
    • Mide anomalía de muestreo (cortes + irregularidad + eficiencia).
    • Severidad de cortes: p95_gap_s alto (en log)
    • Intermitencia: gap_tail_ratio = p95_gap_s / median_gap_s alto (en log)
    • Eficiencia de muestreo: sampling_efficiency lejos del valor típico (ideal ≈ 1, pero se mide como outlier robusto)
  • GPS Score

    • Mide anomalía espacial compatible con saltos GPS, errores de geolocalización o puntos espurios.
    • Severidad robusta del tamaño de paso extremo usando p95_dist_m en log.
    • Valores altos indican que el vehículo presenta distancias entre puntos anormalmente grandes en el percentil alto, lo cual suele asociarse a saltos de posición, mala señal o glitches.
  • Consistency score

    • Severidad robusta del error típico.
    • Mide la inconsistencia entre velocidad reportada y velocidad implícita calculada.
    • Valores altos pueden indicar complejidad para modelar el movimiento (error de distancias, muestreo irregular).

Nota: Usamos el estadístico z_robusto aplicado a las medidas, dicha herramienta es frecuentemente utilizada como método de detección de outliers en textos de EDA.

To detect univariate outliers robustly, we used the modified z‑score based on the median absolute deviation (MAD). Following standard practice in robust statistics (Iglewicz & Hoaglin), observations with |M_i| > 3.5 were considered potential outliers. The constant 0.6745 scales MAD to be consistent with the standard deviation under normality.

In [25]:
def robust_z(x: pd.Series) -> pd.Series:
    """Modified z-score: 0.6745 * (x - median) / MAD (Iglewicz & Hoaglin).

    The tiny epsilon keeps the score finite when the MAD is exactly zero.
    """
    values = x.astype(float)
    center = np.nanmedian(values)
    spread = np.nanmedian(np.abs(values - center)) + 1e-9
    return 0.6745 * (values - center) / spread

def safe_log10(s: pd.Series) -> pd.Series:
    """log10 that maps non-positive inputs to NaN instead of -inf/warnings."""
    as_float = s.astype(float)
    positive_only = as_float.where(as_float > 0)
    return np.log10(positive_only)

def clip_abs(s: pd.Series, cap: float = 6.0) -> pd.Series:
    """Absolute value capped at `cap`, so no single feature dominates a score."""
    return np.minimum(s.abs(), cap)

# -------------------------
# Ensure required columns
# -------------------------
# NOTE(review): `stats` is a per-vehicle summary frame built earlier in the
# notebook (not visible in this chunk) — confirm it exists before this cell.
required = ["id","p95_gap_s","gap_tail_ratio","p95_dist_m","sampling_efficiency","n_points","coverage_hours",
            "median_speed_kph","implied_median_speed_kph", "delta_median_speed_kph"]

# NOTE(review): this rebinds `df`, shadowing the raw-telemetry frame loaded at
# the top of the notebook; consider a distinct name (e.g. `veh_stats`).
df = stats[required].copy()
df["abs_delta_median_speed_kph"] = df["delta_median_speed_kph"].abs()

# -------------------------
# Robust z-scores (MAD-based, via robust_z defined above)
# -------------------------
z = pd.DataFrame(index=df.index)

# Sampling irregularity / gaps — log scale tames the heavy right tails
z["z_log_p95_gap_s"]     = robust_z(safe_log10(df["p95_gap_s"]))
z["z_log_gap_tail_ratio"] = robust_z(safe_log10(df["gap_tail_ratio"]))

# Sampling efficiency (no log; we look for extremes relative to the fleet)
z["z_sampling_eff"] = robust_z(df["sampling_efficiency"])

# GPS jumps proxy (log of the p95 step distance)
z["z_log_p95_dist_m"] = robust_z(safe_log10(df["p95_dist_m"]))

# Consistency (reported vs. implied median speed error)
z["z_abs_delta_speed"] = robust_z(df["abs_delta_median_speed_kph"])

# -------------------------
# Component scores (cap each |z| then combine, so one feature cannot dominate)
# -------------------------
CAP = 6.0

sampling_score = (
    clip_abs(z["z_log_p95_gap_s"], CAP) +
    clip_abs(z["z_log_gap_tail_ratio"], CAP) +
    clip_abs(z["z_sampling_eff"], CAP)
) / 3.0

gps_score = clip_abs(z["z_log_p95_dist_m"], CAP)

consistency_score = clip_abs(z["z_abs_delta_speed"], CAP)

# -------------------------
# Reliability weight (avoid over-reacting to vehicles with little data)
# -------------------------
# Tune these thresholds to your case (reasonable for one day of telemetry)
MIN_POINTS_FOR_FULL = 200
MIN_COV_H_FOR_FULL  = 4.0

# w_points grows log-linearly with point count, saturating at 1.0 at
# MIN_POINTS_FOR_FULL; the clip(lower=2) avoids log10(0)/log10(1) issues.
w_points = np.clip(np.log10(df["n_points"].clip(lower=2)) / np.log10(MIN_POINTS_FOR_FULL), 0, 1)
w_cov    = np.clip(df["coverage_hours"] / MIN_COV_H_FOR_FULL, 0, 1)

reliability_w = (w_points * w_cov).fillna(0)

# -------------------------
# Total outlier score (per-family weights)
# -------------------------
W_SAMPLING = 1.0
W_GPS      = 1.0
W_CONS     = 1.5  # consistency tends to be the most "serious" family for this diagnosis

raw_score = (W_SAMPLING * sampling_score) + (W_GPS * gps_score) + (W_CONS * consistency_score)
outlier_score = raw_score * reliability_w

# -------------------------
# Output table + flags
# -------------------------
out = pd.DataFrame({
    "id": df["id"],
    "sampling_score": sampling_score,
    "gps_score": gps_score,
    "consistency_score": consistency_score,
    "raw_score": raw_score,
    "reliability_w": reliability_w,
    "outlier_score": outlier_score,

    "sampling_efficiency": df["sampling_efficiency"],
    "gap_tail_ratio": df["gap_tail_ratio"],
    "p95_gap_s": df["p95_gap_s"],
    "p95_dist_m": df["p95_dist_m"],
    "abs_delta_median_speed_kph": df["abs_delta_median_speed_kph"],
    "n_points": df["n_points"],
    "coverage_hours": df["coverage_hours"],
})

# Per-family flags at MAD-z > 3.5 (the practical rule from Iglewicz & Hoaglin)
TH = 3.5
out["flag_sampling"]    = (z["z_log_p95_gap_s"].abs() > TH) | (z["z_log_gap_tail_ratio"].abs() > TH) | (z["z_sampling_eff"].abs() > TH)
out["flag_gps"]         = (z["z_log_p95_dist_m"].abs() > TH)
out["flag_consistency"] = (z["z_abs_delta_speed"].abs() > TH)

# Top N outliers (last expression — rendered as the cell output)
out.sort_values("outlier_score", ascending=False).head(25)
Out[25]:
id sampling_score gps_score consistency_score raw_score reliability_w outlier_score sampling_efficiency gap_tail_ratio p95_gap_s p95_dist_m abs_delta_median_speed_kph n_points coverage_hours flag_sampling flag_gps flag_consistency
181 2Nrtauu0kF4Y 0.466861 1.277099 6.000000 10.743960 1.0 10.743960 0.701068 4.448529 605.00 6200.494278 72.287795 392 21.123333 False False True
688 jAESbgsgb8vp 1.503399 0.206251 6.000000 10.709650 1.0 10.709650 0.200353 40.000000 600.00 1626.077933 74.032635 931 19.361667 False False True
618 6n6r55ztR50L 0.289181 1.127169 6.000000 10.416350 1.0 10.416350 0.745604 3.703704 300.00 5415.927354 72.533799 702 21.184167 False False True
376 3wsd1LVYPxrr 0.575109 0.664936 6.000000 10.240045 1.0 10.240045 0.616529 1.875000 30.00 1074.967276 73.062759 1367 9.854444 False False True
372 3qfQAgdkd10D 0.652426 0.303406 6.000000 9.955832 1.0 9.955832 0.356332 4.991667 89.85 1489.598119 76.256423 1533 21.510833 False False True
566 5ZSBX8WIm5fg 0.496187 1.031908 5.536344 9.832612 1.0 9.832612 1.037190 2.281369 300.00 4969.841005 65.407326 581 20.461667 False False True
160 2CwyANdzkUrA 0.614027 0.122079 6.000000 9.736106 1.0 9.736106 0.954160 1.848485 61.00 2186.780331 73.540335 890 8.550278 False False True
403 49G3rMQ9HZGo 0.380000 0.627262 5.802354 9.710793 1.0 9.710793 0.776722 4.615385 300.00 3449.632403 68.191449 816 18.968611 False False True
144 26JvjyZhrZHq 0.358238 0.971186 5.360266 9.369823 1.0 9.369823 0.918966 2.510460 300.00 4704.867883 63.564446 587 21.203333 False False True
200 2VQjbvCWyNSQ 0.253590 0.860352 5.378720 9.182022 1.0 9.182022 0.595578 3.485714 122.00 4257.107601 63.757589 698 11.394167 False False True
665 Xejt5XDWB68Z 0.842640 0.069291 5.454461 9.093623 1.0 9.093623 0.284173 5.545455 61.00 1839.977427 64.550316 1209 12.999722 False False True
467 4iNWUPtXU3IH 0.469167 0.942165 5.108229 9.073675 1.0 9.073675 1.000646 2.222222 300.00 4583.262428 60.926558 436 16.339444 False False True
272 34P0C6N0jwsv 0.487800 0.092410 5.608647 8.993181 1.0 8.993181 0.482971 3.000000 60.00 1801.991926 66.164067 1801 20.716667 False False True
143 267wFnyIxqAn 0.997715 1.251219 4.470444 8.954600 1.0 8.954600 0.910156 1.052632 20.00 633.354666 54.251332 2539 14.723056 False False True
56 1NodC6MiUG6A 0.292117 1.111426 4.974204 8.864849 1.0 8.864849 0.776739 3.370787 300.00 5339.538726 59.523816 490 15.595833 False False True
588 5lV9qKye5C2I 0.310671 0.677006 5.023556 8.523011 1.0 8.523011 0.846620 2.839623 301.00 3607.998150 60.040344 589 20.484722 False False True
599 5t7JFDNbMKgT 0.388641 0.955166 4.725812 8.432525 1.0 8.432525 0.955052 2.542373 300.00 4637.344983 56.924079 327 11.222778 False False True
482 4q4ctvIRr0ZR 0.687679 0.760989 4.598004 8.345674 1.0 8.345674 0.470645 9.709677 301.00 3892.035737 55.586408 1158 21.187222 False False True
207 2bICz2yS4YQz 0.552059 0.274787 4.864748 8.123968 1.0 8.123968 0.526253 7.228916 300.00 2509.848149 58.378217 1095 23.986389 False False True
131 223uDa5Ke2hV 0.672898 0.802181 4.354180 8.006350 1.0 8.006350 0.539201 1.714286 30.00 949.757311 53.034485 2367 21.339444 False False True
361 3hmcNb4X0gGJ 0.611792 0.440490 4.633551 8.002609 1.0 8.002609 1.186349 2.666667 300.00 2914.620493 55.958453 549 14.461389 False False True
256 2wOOQjytBs5k 1.088432 0.165606 4.480428 7.974679 1.0 7.974679 1.335022 1.220000 61.00 1686.822249 54.355826 2035 21.171111 False False True
356 3eeAlSDySqwp 1.218776 0.285790 4.200961 7.806008 1.0 7.806008 0.272422 26.353333 395.30 1513.464613 51.430856 875 13.383056 False False True
108 1pXAjoLACCV8 0.531891 0.581032 4.373368 7.672975 1.0 7.672975 0.520330 6.593407 300.00 3308.692878 53.235306 821 19.942222 False False True
536 5KMSlf784G2M 0.142101 0.671994 4.559414 7.653215 1.0 7.653215 0.667057 3.440476 144.50 3591.714967 55.182515 760 13.292222 False False True

Cinematics¶

Trajectory¶

In [26]:
# Convert GeoPandas to a Trajectory Collection using MovingPandas
# NOTE(review): `gdf` is built earlier in the notebook (not visible in this
# chunk) — confirm it carries ID_COL and a parseable TIME_COL before this cell.
traj_collection = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)
In [27]:
# Drop trajectories whose total length exceeds MAX_LEN_KM (distance outliers).
# get_length() returns meters, hence the /1000 conversion to km.
MAX_LEN_KM = 2_000

filtered = []
for traj in traj_collection:
    if traj.get_length() / 1000 <= MAX_LEN_KM:
        filtered.append(traj)
traj_collection_filtered = mpd.TrajectoryCollection(filtered)

print(f"Kept {len(traj_collection_filtered)} of {len(traj_collection)} trajectories")
Kept 723 of 728 trajectories

Distance histogram¶

In [28]:
# Distribution of (filtered) trajectory lengths, in km
traj_lengths_km = pd.Series(
    [traj.get_length() / 1000 for traj in traj_collection_filtered.trajectories],
    name="trajectory_length_km",
)

# Fixed-width bins from 0 up to the next multiple of bin_width above the max
bin_width = 100
max_len = np.ceil(traj_lengths_km.max() / bin_width) * bin_width
bins = np.arange(0, max_len + bin_width, bin_width)
bin_centers = 0.5 * (bins[1:] + bins[:-1])

fig, ax = plt.subplots(figsize=(10, 4))

# Cumulative share on left axis
counts_raw, _ = np.histogram(traj_lengths_km, bins=bins)
cumulative = counts_raw.cumsum() / counts_raw.sum()
ax.plot(bin_centers, cumulative, color="C1", marker="o", ms=3, label="Cumulative share")
ax.set_ylabel("Cumulative share")
ax.set_ylim(0, 1.15)  # headroom above 1.0 so the last annotation is not clipped
ax.yaxis.grid(True, linestyle="--", alpha=0.6)
ax.set_title("Trajectory lengths")

# Histogram on right axis (twin y; its own ticks are hidden below)
ax_count = ax.twinx()
counts, edges, patches = ax_count.hist(
    traj_lengths_km, bins=bins, color="C0", alpha=0.6, edgecolor="white"
)
ax_count.tick_params(labelright=False,right=False)
ax_count.grid(False)

# Annotate each bin with its count
for rect, count in zip(patches, counts):
    if count == 0:
        continue
    x = rect.get_x() + rect.get_width() / 2
    y = rect.get_height()
    ax_count.text(x, y, f"{int(count)}", ha="center", va="bottom", fontsize=8, color="black")

# Annotate each cumulative point (same color as the line)
for x, y in zip(bin_centers, cumulative):
    ax.text(x, y+.02, f"{y:.2f}", ha="center", va="bottom", fontsize=8, color="C1")


# X labels: end of each interval, centered on the bar
ax.set_xlabel("Bin upper end (km)")
ax.set_xticks(bin_centers)
ax.set_xticklabels([f"{edge:.0f}" for edge in edges[1:]], rotation=45, ha="right")

plt.tight_layout()
plt.show()
No description has been provided for this image

Long Distance Trajectories Map¶

In [29]:
# Keep only long-distance trajectories: drop trips shorter than MIN_LEN_KM.
# get_length() returns meters, hence the /1000 conversion to km.
MIN_LEN_KM = 50

filtered = []
for traj in traj_collection_filtered:
    if traj.get_length() / 1000 >= MIN_LEN_KM:
        filtered.append(traj)
traj_collection_longdistance = mpd.TrajectoryCollection(filtered)

print(f"Kept {len(traj_collection_longdistance)} of {len(traj_collection_filtered)} trajectories")
Kept 722 of 723 trajectories
In [30]:
# Project trajectory lines to WGS84 (EPSG:4326) so they align with web tiles
traj_gdf = traj_collection_longdistance.to_traj_gdf().to_crs(epsg=4326)

# Density map: datashader rasterizes ~700 long trajectories into a heatmap
traj_gdf.hvplot(
    geo=True,
    tiles="CartoLight",
    line_width=1,
    alpha=0.9,
    datashade=True,       # rasterize + shade by density
    aggregator='count',   # counts per pixel
    cmap='hot',
    hover=False,         # skip tooltips for speed
    width=900,
    height=600,
)
Out[30]: