Data Overview¶
Cargamos algunas de las librerias necesarias para nuestro análisis.
from __future__ import annotations
from pathlib import Path
import gc
import os
import base64
import hmac
import hashlib
from typing import Dict, Optional
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.transforms as transforms
import movingpandas as mpd
import numpy as np
import hvplot.pandas
from pyproj import Geod
#mpd.show_versions()
# Go to workspace root
def find_repo_root(start: Optional[Path] = None) -> Path:
    """Walk upward from *start* (default: cwd) and return the first directory
    that contains a repo marker; fall back to the resolved start directory."""
    base = (start or Path.cwd()).resolve()
    marker_names = (".git", "pyproject.toml", "requirements.txt", "setup.cfg")
    for candidate in (base, *base.parents):
        if any((candidate / name).exists() for name in marker_names):
            return candidate
    return base
Ingestion¶
Ingestamos los archivos que tenemos disponibles
# Configure ingestion: resolve the workspace root and locate the raw CSV dumps.
ROOT = find_repo_root()
DATA_DIR = "data/raw"
DATA_PATH = ROOT / DATA_DIR
CSV_PATH = sorted(DATA_PATH.glob("*.csv"))[0]  # first CSV alphabetically; any daily dump works
# Column names of the telematics CSV schema used throughout the notebook.
ID_COL = "nombre"    # vehicle identifier (plate/domain) - obfuscated later
TIME_COL = "fecha"   # timestamp string, parsed to datetime later
LAT_COL = "lat"      # latitude (treated as EPSG:4326 downstream)
LON_COL = "lon"      # longitude (treated as EPSG:4326 downstream)
SPEED_COL = "vel"    # reported speed (presumably km/h - TODO confirm with provider)
# Read the raw CSV into memory.
df = pd.read_csv(CSV_PATH)
print(f"Raw data points: {len(df)} in {CSV_PATH}")
print(f"Raw vehicles: {len(df[ID_COL].unique())}")
Raw data points: 14138871 in C:\fabro\Projects\eda-notebook-workspace\data\raw\datos_telematica_2025-09-02.csv
Raw vehicles: 20673
Ofuscación¶
Como nuestro Notebook estará publicado en un repositorio público, vamos a ofuscar las identidades. Generamos un Hash a partir de los dominios de los vehículos y una clave privada. Recortamos el token final a 12 caracteres (suficiente para el parque de vehículos que venimos trabajando ~20k).
# Needed functions
# Load KEY=VALUE
def load_kv_file(path: Path) -> Dict[str, str]:
    """Parse a simple KEY=VALUE file (dotenv-style) into a dict.

    Blank lines, lines starting with '#', and lines without '=' are skipped.
    Values are stripped of surrounding whitespace and of surrounding single
    or double quotes. Returns an empty dict when the file does not exist.
    """
    result: Dict[str, str] = {}
    if not path.exists():
        return result
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue
        key, _, value = entry.partition("=")
        result[key.strip()] = value.strip().strip("'").strip('"')
    return result
# Get Key from file
def get_hmac_key(
    env_var: str = "PLATE_HMAC_KEY_B64",
    secrets_relpath: str = "config/secrets.env",
) -> bytes:
    """Load the base64-encoded HMAC key from the repo's secrets file.

    Looks up *env_var* in ``<repo root>/<secrets_relpath>`` (KEY=VALUE format),
    decodes it from base64 and enforces a minimum length of 16 bytes.

    Raises:
        RuntimeError: if the variable is absent from the secrets file.
        ValueError: if the value is not valid base64 or the key is too short.
    """
    root = find_repo_root()
    secrets_path = root / secrets_relpath
    kv = load_kv_file(secrets_path)
    b64 = kv.get(env_var)
    if not b64:
        raise RuntimeError(
            f"No encontré {env_var}. Seteala en {secrets_relpath}."
        )
    try:
        key = base64.b64decode(b64, validate=True)
    except Exception as e:
        # Chain the original decode error so the root cause isn't lost.
        raise ValueError(f"{env_var} no es base64 válido: {e}") from e
    if len(key) < 16:
        raise ValueError(f"{env_var} demasiado corta ({len(key)} bytes). Usá >= 16, ideal 32.")
    return key
# HMAC -> Base62 -> token corto
_ALPH = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
def _b62_from_int(x: int) -> str:
if x == 0:
return _ALPH[0]
s = []
while x:
x, r = divmod(x, 62)
s.append(_ALPH[r])
return "".join(reversed(s))
def _bytes_needed_for_base62_len(out_len: int) -> int:
    """Return how many digest bytes to take so their base62 rendering can
    cover *out_len* characters (with deliberate headroom)."""
    # bits ~= out_len * log2(62) ; log2(62) ≈ 5.954
    # NOTE(review): the "+ 7" here combined with the "(bits + 7) // 8" below
    # ceilings twice, over-allocating up to one extra byte. Harmless (the
    # token is trimmed to out_len afterwards), but do NOT "fix" it: changing
    # the byte count would change every generated token.
    bits = int((out_len * 5.954242509439324) + 7)
    return max(8, (bits + 7) // 8)  # at least 8 bytes (64 bits) for robustness
def plate_token_norm(norm_plate: str, key: bytes, out_len: int = 12) -> str:
    """HMAC-SHA256 a normalized plate and render it as a fixed-width base62
    token of exactly *out_len* characters."""
    payload = (norm_plate or "").encode("utf-8")
    mac = hmac.new(key, payload, hashlib.sha256)
    take = _bytes_needed_for_base62_len(out_len)
    value = int.from_bytes(mac.digest()[:take], "big")
    encoded = _b62_from_int(value)
    # Left-pad with the zero digit, then trim to the requested width.
    return encoded.rjust(out_len, _ALPH[0])[:out_len]
def obfuscate_plate_column(df: pd.DataFrame, col: str, key: bytes, out_len: int = 12, out_col: str | None = None):
    """Replace plate identifiers in *col* with deterministic HMAC tokens.

    Plates are normalized (trim, uppercase, drop spaces and dashes) before
    hashing, so visually-equivalent plates map to the same token. Writes the
    result to *out_col* (defaults to *col*) in place and returns the mutated
    frame; missing plates stay missing (pd.NA).
    """
    target = out_col or col
    as_string = df[col].astype("string")
    normalized = (
        as_string.fillna("")
        .str.strip()
        .str.upper()
        .str.replace(" ", "", regex=False)
        .str.replace("-", "", regex=False)
    )
    # Hash each distinct plate once, then broadcast back via factorize codes.
    codes, uniques = pd.factorize(normalized, sort=False)
    unique_tokens = np.array(
        [plate_token_norm(u, key=key, out_len=out_len) for u in uniques],
        dtype=object,
    )
    df[target] = unique_tokens[codes]
    # Restore missingness that fillna("") erased above.
    df.loc[as_string.isna(), target] = pd.NA
    return df
# Obfuscate vehicle identities in place, then drop the key from memory.
key = get_hmac_key()
df = obfuscate_plate_column(df, col=ID_COL, key=key, out_len=12)
del key  # don't keep the secret alive longer than needed
gc.collect()
72
Data Cleaning and Filtering¶
df.head()  # quick visual check that the ID column is now tokenized
| nombre | fecha | idtipo | idtipoexterno | segundosdemorarecepcion | lat | lon | vel | rumbo | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | nzb6PvZHUPEZ | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | NaN | NaN | NaN | NaN |
| 1 | XuzAaa26JPjR | 2025-09-02 02:46:42+00 | 0 | 1 | 3 | -32.85436 | -61.34860 | 82.0 | 266.0 |
| 2 | OL7Nf8ksDelc | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | -34.61745 | -60.40863 | 0.0 | NaN |
| 3 | zT7zfozMrWsX | 2025-09-02 02:46:40+00 | 0 | 1006 | 5 | -33.53368 | -60.10854 | 82.0 | 319.0 |
| 4 | TtPO38583MG3 | 2025-09-02 02:46:41+00 | 0 | 1 | 4 | -34.31298 | -60.23689 | 0.0 | NaN |
Basic Clean¶
Realizamos limpieza básica de los datos.
# Drop rows missing any column required for trajectory analysis.
required_cols = [ID_COL, TIME_COL, LAT_COL, LON_COL, SPEED_COL]
df_clean = df.dropna(subset=required_cols).copy()
print(f"Removed {len(df) - len(df_clean)} rows with missing values in {required_cols}")
Removed 82908 rows with missing values in ['nombre', 'fecha', 'lat', 'lon', 'vel']
# Drop vehicles with fewer than two fixes (a trajectory needs >= 2 points).
vehicles = df_clean[ID_COL].unique()
traj_sizes = df_clean.groupby(ID_COL).size()
valid_ids = traj_sizes[traj_sizes >= 2].index
df_clean = df_clean[df_clean[ID_COL].isin(valid_ids)]
print(f"Removed {len(vehicles) - len(valid_ids)} vehicles with <2 registries")
Removed 62 vehicles with <2 registries
Discard Home Ranged Vehicles¶
Descartamos vehículos que consideramos que están en un área operativa reducida o, posiblemente para el momento del análisis, dentro del home range si consideramos que todos realizan viajes de larga distancia.
# Radius (km) under which a vehicle is considered to stay in its home range.
HOME_RANGE_THRESHOLD = 50
# Sort by ID and time so "first"/"last" aggregations are chronological.
# NOTE(review): TIME_COL is still a string here (parsed to datetime later);
# lexicographic order matches chronological order only if all timestamps
# share the same ISO-like format - confirm for multi-day files.
df_clean = df_clean.sort_values([ID_COL, TIME_COL])
# First and last fix per vehicle.
ends = df_clean.groupby(ID_COL).agg(
    lon_start=(LON_COL, "first"),
    lat_start=(LAT_COL, "first"),
    lon_end=(LON_COL, "last"),
    lat_end=(LAT_COL, "last"),
)
# Geodesic start->end distance on the WGS84 ellipsoid. geod.inv returns
# (forward azimuth, back azimuth, distance in meters); we keep the distance.
geod = Geod(ellps="WGS84")
_, _, dist_m = geod.inv(
    ends["lon_start"].values,
    ends["lat_start"].values,
    ends["lon_end"].values,
    ends["lat_end"].values,
)
ends["dist_km"] = dist_m / 1000.0
# Drop vehicles whose net start->end displacement stays inside the radius.
ids_to_drop = ends.loc[ends["dist_km"] < HOME_RANGE_THRESHOLD].index
df_filtered = df_clean.loc[~df_clean[ID_COL].isin(ids_to_drop)].copy()
print(f"Dropped {len(ids_to_drop)} vehicles; remaining: {len(df_filtered[ID_COL].unique())}")
Dropped 13184 vehicles; remaining: 7373
Sampling¶
Tomamos una muestra aleatoria de los datos, utilizando el dominio del vehículo para filtrar.
PERCENT=0.10  # fraction of vehicles to keep in the sample
# Sample vehicle IDs (reproducible via random_state) and subset the frame.
vehicles = df_filtered[ID_COL].unique()
sampled_ids = pd.Series(vehicles).sample(frac=PERCENT, random_state=42)
df_sampled = df_filtered[df_filtered[ID_COL].isin(sampled_ids)].copy()
print(f"Total vehicles: {len(vehicles)} | Sampled: {len(sampled_ids)}")
Total vehicles: 7373 | Sampled: 737
Pre-Processing¶
En esta sección mejoramos/preparamos los datos para el análisis que vamos a realizar luego. No filtramos ni limpiamos sino que sacamos elementos que no serán necesarios para nuestro análisis.
# Parse timestamps as UTC, then drop tz info (naive UTC from here on).
df_sampled[TIME_COL] = pd.to_datetime(df_sampled[TIME_COL], utc=True).dt.tz_localize(None)
# Trim leading/trailing zero-speed runs per (vehicle, day) group.
df_sampled = df_sampled.sort_values([ID_COL, TIME_COL])
df_sampled["date"] = df_sampled[TIME_COL].dt.date
group_cols = [ID_COL, "date"]
value_cols = df_sampled.columns.difference(group_cols)
def trim_zero_edges(g):
    """Drop zero-speed rows at both ends of a group; a group with no
    movement at all is dropped entirely."""
    nz_idx = g.index[g[SPEED_COL].fillna(0) != 0]
    if nz_idx.empty:
        return g.iloc[0:0]  # no movement that day -> drop all rows
    first, last = nz_idx[0], nz_idx[-1]
    # NOTE(review): label-based slicing assumes the group's index labels are
    # unique (true for a default read_csv index) - confirm if the index ever
    # gets reset/duplicated upstream.
    return g.loc[first:last]
df_trimmed = (
    df_sampled.groupby(group_cols, group_keys=True)[value_cols]
    .apply(trim_zero_edges)
    .reset_index(level=[0, 1])  # bring group keys back as columns
    .reset_index(drop=True)
)
print(f"Trimmed {len(df_sampled) - len(df_trimmed)} rows")
Trimmed 65729 rows
# Convert to GeoPandas: points are read as EPSG:4326 (lon/lat) and projected
# to EPSG:3857 (Web Mercator - note: not UTM, despite similar units).
gdf = gpd.GeoDataFrame(
    df_trimmed,
    geometry=gpd.points_from_xy(df_trimmed[LON_COL], df_trimmed[LAT_COL], crs="EPSG:4326").to_crs("EPSG:3857"),
)
Inspect¶
Código auxiliar para inspeccionar datos.
Trajectory¶
DOMINIO="3UEGwOiggnWt"  # obfuscated vehicle token to inspect
g = gdf.loc[gdf[ID_COL] == DOMINIO, [ID_COL, TIME_COL, "geometry"]].copy()
if g.empty:
    raise ValueError(f"No se encontraron registros para ID_COL={DOMINIO}")
# movingpandas expects a time-indexed GeoDataFrame.
g = g.sort_values(TIME_COL).set_index(TIME_COL)
traj = mpd.Trajectory(g, traj_id=DOMINIO)
# Back to lon/lat for plotting over web tiles.
traj_gdf = traj.to_traj_gdf().to_crs(epsg=4326)
traj_gdf.hvplot(
    geo=True,
    tiles="CartoLight",
    rasterize=True,  # similar to datashade, hover on pixels
    line_width=1,
    alpha=0.9,
    #hover=False,  # skip tooltips for speed
    width=900,
    height=600,
)
Analysis¶
Esta sección contiene cada una de las lineas de análisis de nuestros datos.
Sampling Quality¶
¿Qué busco con Sampling Quality? Analizar, describir y desarrollar métricas orientadas a eliminar el ruido de los datos que voy a usar para futuros modelos. No evaluamos calidad de la observación generada (GPS o cinemómetro) sino cantidad, frecuencia temporal y/o espacial, y otros atributos que podamos derivar.
Run Statistics¶
# Build one trajectory per vehicle from the projected GeoDataFrame.
tc = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)
# Per-trajectory gap and speed stats (meters, m/s).
records = []
for traj in tc:
    # Prefer an explicit "timestamp" column; otherwise use the time index.
    time_series = (
        traj.df["timestamp"]
        if "timestamp" in traj.df.columns
        else traj.df.index.to_series()
    )
    time_series = pd.to_datetime(time_series)
    gaps = time_series.diff().dt.total_seconds()
    # add_distance/add_speed append "distance" and "speed" columns computed
    # in the gdf's CRS units (EPSG:3857 here) - presumably meters and m/s,
    # though Web Mercator distorts distances away from the equator; confirm.
    traj.add_distance()
    traj.add_speed()
    n_points = len(traj.df)
    coverage_s = (time_series.max() - time_series.min()).total_seconds()
    # Main per-trajectory measures.
    records.append({
        "id": traj.id,
        "n_points": n_points,
        "coverage_s" : coverage_s,
        "median_gap_s": gaps.median(),
        "p95_gap_s": gaps.quantile(0.95),
        "median_dist_m": traj.df["distance"].median(),
        "p95_dist_m": traj.df["distance"].quantile(0.95),
        "median_speed_mps": traj.df["speed"].median(),
        "p95_speed_mps": traj.df["speed"].quantile(0.95),
        "pct_speed_over_50kph": (traj.df["speed"] > 50/3.6).mean(),
    })
stats = pd.DataFrame(records)
# Basic density measures.
stats["coverage_hours"] = stats["coverage_s"] / 3600.0
stats["pts_per_hour"] = stats["n_points"] / stats["coverage_hours"]
# Expected points (coverage / median gap) and sampling efficiency.
stats["expected_points_median_gap"] = stats["coverage_s"] / stats["median_gap_s"]
stats["sampling_efficiency"] = stats["n_points"] / stats["expected_points_median_gap"]  # ~1 if sampling is consistent
# Consistency between implied speed (distance/gap) and reported speed.
stats["median_speed_kph"] = stats["median_speed_mps"] * 3.6
stats["p95_speed_kph"] = stats["p95_speed_mps"] * 3.6
stats["implied_median_speed_kph"] = (stats["median_dist_m"] / (stats["median_gap_s"])) * 3.6
stats["implied_p95_speed_kph"] = (stats["p95_dist_m"] / (stats["p95_gap_s"])) * 3.6
stats["delta_median_speed_kph"] = stats["implied_median_speed_kph"] - stats["median_speed_kph"]
stats["delta_p95_speed_kph"] = stats["implied_p95_speed_kph"] - stats["p95_speed_kph"]
# Tail / median sampling-gap ratio (irregularity indicator).
stats["gap_tail_ratio"] = stats["p95_gap_s"] / stats["median_gap_s"]
Plotting + Description¶
Ejecutamos algunas gráficas para visualizar la dispersión de los datos en base a algunas medidas propuestas.
# Left plot: coverage vs number of points, with a linear fit.
x = stats["coverage_hours"].to_numpy()
y = stats["n_points"].to_numpy()
# Ordinary least-squares line y = a*x + b.
a, b = np.polyfit(x, y, 1)
# Dense grid to draw the fitted line.
x_fit = np.linspace(x.min(), x.max(), 200)
y_fit = a * x_fit + b
# Coefficient of determination (R^2) of the linear fit.
y_pred = a * x + b
ss_res = np.sum((y - y_pred) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2 = 1 - ss_res / ss_tot if ss_tot > 0 else np.nan
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
ax1.scatter(x, y, c="cadetblue",alpha=0.7)
ax1.plot(x_fit, y_fit, color="coral", linewidth=2)
ax1.set_yscale("log")  # log display only; the fit itself is linear in y
ax1.set_xlabel("coverage_hours")
ax1.set_ylabel("n_points (log)")
ax1.set_title("Coverage vs Num. of Points")
ax1.grid(True, alpha=0.3)
# NOTE(review): the annotation's log() wrapper describes the axis scale,
# not the fitted model (which is linear).
eq = f"log(y) = log({a:.4f}·x + {b:.4f})\nR² = {r2:.4f}"
ax1.text(
    0.025, 0.97, eq,
    transform=ax1.transAxes,
    va="top", ha="left",
    color="crimson",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
# Right plot: median gap vs point density, with mean reference lines.
mean_x = stats["median_gap_s"].mean()
mean_y = stats["pts_per_hour"].mean()
# Blended transforms: anchor text in data coords on one axis and axes
# (0-1) coords on the other.
tx = transforms.blended_transform_factory(ax2.transData, ax2.transAxes)  # x=data, y=axes
ty = transforms.blended_transform_factory(ax2.transAxes, ax2.transData)  # x=axes, y=data
ax2.scatter(stats["median_gap_s"], stats["pts_per_hour"], alpha=.7)
ax2.set_xscale("log"); ax2.set_yscale("log")
ax2.set_xlabel("median_gap_s (log)")
ax2.set_ylabel("pts_per_hour (log)")
ax2.axvline(mean_x, color="coral", linestyle=":", linewidth=2)
ax2.axhline(mean_y, color="coral", linestyle=":", linewidth=2)
ax2.set_title("Sampling Gap vs Point Density")
ax2.grid(True, alpha=.3)
ax2.text(
    mean_x, 0.98, f"μx = {mean_x:.2f}",
    transform=tx, rotation=90,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
ax2.text(
    0.98, mean_y, f"μy = {mean_y:.2f}",
    transform=ty,
    va="top", ha="right", color="red",
    bbox=dict(facecolor="white", alpha=0.8, edgecolor="crimson")
)
plt.tight_layout()
plt.show()
Coverage vs Num. of Points:
- Cobertura baja + pocos puntos => “vehículo con poca actividad / ventana corta”.
- Cobertura alta + pocos puntos => mal muestreo.
- Regresión y R^2 = Baja relación lineal entre cobertura y número de puntos.
- ¿Una flota uniforme debería ajustarse mejor?
Sampling Gap vs Point Density:
- ¿Qué es lo que buscamos? Outliers con median_gap bajo pero pts_per_hour bajo.
- Puede implicar gap mal estimado o timestamps raros.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Left plot: distribution of sampling_efficiency (ideal ~= 1).
x = stats["sampling_efficiency"]
# Clip the 1%-99% tails so extreme outliers don't stretch the histogram.
lo, hi = x.quantile([0.01, 0.99])
x_plot = x.clip(lo, hi)
ax1.hist(x_plot, bins=50, color="darkseagreen", alpha=0.8, rwidth=0.92)
ax1.axvline(1.0, linewidth=1, color="crimson")  # ideal efficiency
ax1.axvline(x.median(), linestyle="--", linewidth=2)
ax1.axvline(x.quantile(0.95), linestyle=":", linewidth=2)
ax1.set_xlabel("sampling_efficiency")
ax1.set_title("Sampling Efficiency")
ax1.grid(True, alpha=.25)
ax1.set_axisbelow(True)
ax1.text(
    0.02, 0.97,
    f"median={x.median():.2f}\nP95={x.quantile(0.95):.2f}\nP99={x.quantile(0.99):.2f}",
    transform=ax1.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)
# Right plot: gap tail ratio on a log10 scale (0 == perfectly regular).
g = stats["gap_tail_ratio"]
g_log = np.log10(g)
# Clip the 1%-99% tails for display.
lo, hi = g_log.quantile([0.01, 0.99])
g_plot = g_log.clip(lo, hi)
ax2.hist(g_plot, bins=40, color="mediumseagreen", alpha=0.8, rwidth=0.92)
ax2.axvline(0.0, linewidth=1, color="crimson")  # log10(1): p95 gap == median gap
ax2.axvline(g_log.median(), linestyle="--", linewidth=2)
ax2.axvline(g_log.quantile(0.95), linestyle=":", linewidth=2)
ax2.set_xlabel("log10(p95_gap / median_gap)")
ax2.set_title("Sample Irregularity (Tail ratio)")
ax2.grid(True, alpha=.25)
ax2.set_axisbelow(True)
ax2.text(
    0.83, 0.97,
    f"median={g_log.median():.2f}\nP95={g_log.quantile(0.95):.2f}\nP99={g_log.quantile(0.99):.2f}",
    transform=ax2.transAxes, va="top", ha="left",
    bbox=dict(facecolor="white", alpha=0.85, edgecolor="0.7")
)
plt.tight_layout()
plt.show()
Sampling Efficiency (Ideal 1)
- 1: el número de puntos esperados (tiempo/mediana de gap) es el número de puntos que tenemos.
- <1: faltan puntos respecto a lo que sugiere el gap.
- >1: duplicados, bursts, o median_gap subestima.
Sample Irregularity (Tail ratio) (Ideal 0)
- Percentil 95 de gaps / Mediana de gaps
- Ratio alto = Corte de la regularidad (¿faltan muestras?)
Outliers acá suelen ser saltos GPS o timestamps con problemas.
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
# NOTE(review): this rebinds `df` (previously the raw data) to a stats
# subset; from here on `df` means per-vehicle statistics.
df = stats[cols]
# Color points by log10(gap_tail_ratio), clipped at the 1%/99% quantiles.
c = np.log10(df["gap_tail_ratio"])
c_lo, c_hi = np.nanquantile(c, [0.01, 0.99])
c_plot = np.clip(c, c_lo, c_hi)
# Shared axis limits so the identity line sits at 45 degrees.
lim = np.nanmax([df["median_speed_kph"].max(), df["implied_median_speed_kph"].max()])
lims = (0, lim)
# Tolerance bands around the identity line.
tol1 = 0.10  # ±10%
tol2 = 0.25  # ±25%
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# =========================
# (1) Consistency: implied vs reported median speed
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)
# ±10% zone
y_low1 = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
# ±25% zone
y_low2 = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)
# Identity line y = x.
ax1.plot(x_band, x_band, color="0.6", linewidth=1)
sc1 = ax1.scatter(df["median_speed_kph"], df["implied_median_speed_kph"], c=c_plot, alpha=0.6, s=30)
ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.legend(loc="upper left", framealpha=0.9)
# =========================
# (2) Residuals: delta = implied - reported
# =========================
x_band2 = np.linspace(df["median_speed_kph"].min(), df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2, tol1*x_band2, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2, tol1*x_band2, tol2*x_band2, color="0.92", alpha=0.9)
ax2.axhline(0, color="0.6", linewidth=1)
sc2 = ax2.scatter(df["median_speed_kph"], df["delta_median_speed_kph"], c=c_plot, alpha=0.6, s=30)
ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.legend(loc="lower right", framealpha=0.9)
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("log10(gap_tail_ratio)")
fig.suptitle(
    "Speed Analysis (Color by gap_tail_ratio)",
    fontsize=14
)
plt.tight_layout()
plt.show()
cols = ["median_speed_kph", "implied_median_speed_kph", "gap_tail_ratio","sampling_efficiency","delta_median_speed_kph"]
df = stats[cols]
# Color points by sampling_efficiency on a diverging scale centered at the
# fleet mean (clipped into [vmin, vmax] as TwoSlopeNorm requires).
eff = df["sampling_efficiency"].to_numpy()
vmin = np.quantile(eff, 0.05)
vmax = np.quantile(eff, 0.99)
vcenter = np.mean(eff)
vcenter = np.clip(vcenter, vmin, vmax)
norm = mcolors.TwoSlopeNorm(vmin=vmin, vcenter=vcenter, vmax=vmax)
eff_clip = np.clip(eff, vmin, vmax)
# Tolerance bands around the identity line.
tol1 = 0.10  # ±10%
tol2 = 0.25  # ±25%
# Shared axis limits so the identity line sits at 45 degrees.
lim = np.nanmax([df["median_speed_kph"].max(), df["implied_median_speed_kph"].max()])
lims = (0, lim)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# =========================
# (1) Consistency: implied vs reported median speed
# =========================
x_band = np.linspace(max(lims[0], 0), lims[1], 300)
# ±10% zone
y_low1 = (1 - tol1) * x_band
y_high1 = (1 + tol1) * x_band
ax1.fill_between(x_band, y_low1, y_high1, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
# ±25% zone
y_low2 = (1 - tol2) * x_band
y_high2 = (1 + tol2) * x_band
ax1.fill_between(x_band, y_low2, y_low1, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax1.fill_between(x_band, y_high1, y_high2, color="0.92", alpha=0.9)
# Identity line y = x.
ax1.plot(x_band, x_band, color="0.6", linewidth=1)
sc1 = ax1.scatter(
    df["median_speed_kph"], df["implied_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)
ax1.set_xlim(lims); ax1.set_ylim(lims)
ax1.set_aspect("equal", adjustable="box")
ax1.set_xlabel("median_speed_kph (reported)")
ax1.set_ylabel("median_speed_kph (implicit dist/gap)")
ax1.set_title("Speed Consistency")
ax1.grid(True, alpha=0.25)
ax1.set_axisbelow(True)
ax1.legend(loc="lower right", framealpha=0.9)
# =========================
# (2) Residuals: delta = implied - reported
# =========================
x_band2 = np.linspace(df["median_speed_kph"].min(), df["median_speed_kph"].max(), 300)
ax2.fill_between(x_band2, -tol1*x_band2, tol1*x_band2, color="0.85", alpha=0.8, label=f"±{int(tol1*100)}%")
ax2.fill_between(x_band2, -tol2*x_band2, -tol1*x_band2, color="0.92", alpha=0.9, label=f"±{int(tol2*100)}%")
ax2.fill_between(x_band2, tol1*x_band2, tol2*x_band2, color="0.92", alpha=0.9)
ax2.axhline(0, color="0.6", linewidth=1)
sc2 = ax2.scatter(
    df["median_speed_kph"], df["delta_median_speed_kph"],
    c=eff_clip, norm=norm, cmap="coolwarm_r",
    alpha=0.65, s=30
)
ax2.set_xlabel("median_speed_kph (reported)")
ax2.set_ylabel("delta_kph (implicit - reported)")
ax2.set_title("Residuals of Implied vs Reported")
ax2.grid(True, alpha=0.25)
ax2.set_axisbelow(True)
ax2.legend(loc="upper left", framealpha=0.9)
cb2 = plt.colorbar(sc2, ax=ax2)
cb2.set_label("sampling_efficiency")
cb2.set_ticks([vmin, vcenter, vmax])
fig.suptitle(
    "Speed Analysis (Color by sampling_efficiency)",
    fontsize=14
)
plt.tight_layout()
plt.show()
Algunas cosas que observamos:
- Concentración cerca de la diagonal (zona ±10%), especialmente en el rango medio–alto de velocidades.
- La velocidad implícita suele subestimar la reportada, y el efecto se acentúa a velocidades mayores.
- El error crece con la velocidad.
- gap_tail_ratio se alinea bien con los problemas de consistencia (la velocidad reportada es menor que la implicita)
- sampling_efficiency evidencia que hay menos puntos que los esperables en todo el rango de velocidades.
Algunas cosas para investigar a futuro:
- Segmentación de recorridos.
- Detección de paradas.
- Generalizaciones para zonas sin datos.
cols = ["id","median_speed_kph","implied_median_speed_kph","gap_tail_ratio","sampling_efficiency","n_points","coverage_hours","delta_median_speed_kph"]
df = stats[cols]
# "Out of band" definition.
# Below this reported speed (kph) relative error is noisy, so exclude.
MIN_SPEED = 10
df_eval = df[df["median_speed_kph"] >= MIN_SPEED].copy()
df_eval["abs_delta_kph"] = df_eval["delta_median_speed_kph"].abs()
df_eval["abs_rel_err"] = df_eval["abs_delta_kph"] / df_eval["median_speed_kph"].clip(lower=1e-6)
# Tolerance thresholds used to flag out-of-band vehicles.
tol10, tol25 = 0.10, 0.25
df_eval["out_10pct"] = df_eval["abs_delta_kph"] > (tol10 * df_eval["median_speed_kph"])
df_eval["out_25pct"] = df_eval["abs_delta_kph"] > (tol25 * df_eval["median_speed_kph"])
# Speed bins (kph), left-closed via right=False.
bins = [MIN_SPEED, 20, 40, 60, 80, 100, 120, 160, np.inf]
labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)]
df_eval["speed_bin"] = pd.cut(df_eval["median_speed_kph"], bins=bins, labels=labels, right=False)
summary = (
    df_eval.groupby("speed_bin", observed=False)
    .agg(
        n=("id","count"),
        out_10pct_rate=("out_10pct", "mean"),
        out_25pct_rate=("out_25pct", "mean"),
        median_abs_delta=("abs_delta_kph","median"),
        p95_abs_delta=("abs_delta_kph", lambda s: s.quantile(0.95)),
        median_gap_tail_ratio=("gap_tail_ratio","median"),
        median_sampling_eff=("sampling_efficiency","median"),
    )
    .reset_index()
)
# Round rates for display.
summary["out_10pct_rate"] = (summary["out_10pct_rate"]).round(4)
summary["out_25pct_rate"] = (summary["out_25pct_rate"]).round(4)
summary
| speed_bin | n | out_10pct_rate | out_25pct_rate | median_abs_delta | p95_abs_delta | median_gap_tail_ratio | median_sampling_eff | |
|---|---|---|---|---|---|---|---|---|
| 0 | 10-20 | 66 | 0.9091 | 0.8636 | 9.917205 | 14.965177 | 4.730206 | 0.641597 |
| 1 | 20-40 | 112 | 0.8929 | 0.8214 | 15.579568 | 27.303946 | 4.078311 | 0.576172 |
| 2 | 40-60 | 80 | 0.8000 | 0.6000 | 16.188403 | 35.582364 | 3.457812 | 0.554763 |
| 3 | 60-80 | 103 | 0.6699 | 0.5049 | 17.111441 | 53.443365 | 2.542373 | 0.667343 |
| 4 | 80-100 | 202 | 0.3465 | 0.2277 | 2.701474 | 59.466536 | 1.896023 | 0.729820 |
| 5 | 100-120 | 48 | 0.3125 | 0.2292 | 1.865364 | 60.945672 | 1.355311 | 0.699619 |
| 6 | 120-160 | 5 | 0.6000 | 0.2000 | 12.268467 | 33.681922 | 1.016667 | 0.721329 |
| 7 | 160-inf | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
Scoring¶
Seleccionamos Top N vehículos en función de una métrica de interés para empezar a entender nuestros outliers.
SCORING_BY = "abs_rel_err"  # metric used to rank outlier candidates
# Top-20 vehicles by the chosen score, highest first.
top = (
    df_eval.assign(score=df_eval[SCORING_BY])
    .sort_values("score", ascending=False)
    .head(20)
    .copy()
)
cols_show = [
    "id",
    "median_speed_kph",
    "implied_median_speed_kph",
    "delta_median_speed_kph",
    "abs_delta_kph",
    "abs_rel_err",
    "gap_tail_ratio",
    "sampling_efficiency",
    "n_points",
    "coverage_hours",
]
# Round for display and rename the score column to make its meaning explicit.
top[cols_show].assign(
    abs_rel_err=lambda d: (d["abs_rel_err"]).round(3),  # relative error rate
    delta_median_speed_kph=lambda d: d["delta_median_speed_kph"].round(1),
    abs_delta_kph=lambda d: d["abs_delta_kph"].round(1),
    median_speed_kph=lambda d: d["median_speed_kph"].round(1),
    implied_median_speed_kph=lambda d: d["implied_median_speed_kph"].round(1),
    gap_tail_ratio=lambda d: d["gap_tail_ratio"].round(2),
    sampling_efficiency=lambda d: d["sampling_efficiency"].round(2),
    coverage_hours=lambda d: d["coverage_hours"].round(2),
).rename(columns={"abs_rel_err":"abs_rel_err_rate"})
| id | median_speed_kph | implied_median_speed_kph | delta_median_speed_kph | abs_delta_kph | abs_rel_err_rate | gap_tail_ratio | sampling_efficiency | n_points | coverage_hours | |
|---|---|---|---|---|---|---|---|---|---|---|
| 333 | 3UEGwOiggnWt | 20.6 | 43.0 | 22.4 | 22.4 | 1.088 | 34.00 | 0.05 | 3652 | 20.39 |
| 666 | Y1qBOymx8YGb | 28.9 | 59.3 | 30.4 | 30.4 | 1.052 | 30.00 | 0.06 | 2143 | 10.67 |
| 425 | 4MB0oH5pe80M | 23.7 | 1.0 | -22.7 | 22.7 | 0.957 | 6.58 | 0.56 | 164 | 14.62 |
| 591 | 5oPliRq1UI1v | 12.6 | 0.6 | -12.0 | 12.0 | 0.952 | 2.29 | 1.07 | 783 | 21.28 |
| 188 | 2S4buidKVaeo | 10.4 | 0.5 | -9.9 | 9.9 | 0.950 | 2.52 | 1.05 | 501 | 15.98 |
| 199 | 2UxsOtbEaYvc | 10.4 | 0.5 | -9.8 | 9.8 | 0.947 | 2.67 | 1.01 | 434 | 13.55 |
| 198 | 2UNu3nAvubkL | 20.0 | 1.1 | -18.9 | 18.9 | 0.944 | 3.22 | 0.90 | 395 | 11.40 |
| 576 | 5edKs4Tl9QiP | 13.4 | 0.8 | -12.5 | 12.5 | 0.938 | 2.66 | 0.99 | 372 | 11.79 |
| 95 | 1lGQ1tCRHXrn | 11.0 | 0.7 | -10.3 | 10.3 | 0.933 | 5.01 | 0.56 | 261 | 23.42 |
| 46 | 1Kyg7cHK3Mpi | 12.5 | 0.9 | -11.6 | 11.6 | 0.930 | 5.36 | 0.66 | 294 | 20.90 |
| 73 | 1YoE4zWDpLth | 20.8 | 1.5 | -19.4 | 19.4 | 0.929 | 2.77 | 1.14 | 805 | 21.23 |
| 91 | 1kENfr6FX4UD | 32.4 | 2.7 | -29.6 | 29.6 | 0.915 | 2.96 | 0.76 | 1017 | 14.39 |
| 109 | 1qBbPK8P73tB | 23.4 | 2.0 | -21.4 | 21.4 | 0.913 | 4.97 | 0.68 | 287 | 21.14 |
| 343 | 3ZZrHcZzX2eI | 19.5 | 1.9 | -17.7 | 17.7 | 0.904 | 2.34 | 0.93 | 624 | 23.97 |
| 495 | 4zQiEZ6eO8jr | 10.5 | 1.0 | -9.4 | 9.4 | 0.902 | 4.95 | 0.65 | 799 | 20.95 |
| 391 | 43m3itVV18e6 | 26.7 | 2.6 | -24.1 | 24.1 | 0.901 | 1.02 | 1.52 | 655 | 14.19 |
| 471 | 4kiueP9NP0EV | 17.0 | 1.7 | -15.3 | 15.3 | 0.901 | 1.07 | 1.15 | 1457 | 21.12 |
| 724 | xkvxf4FjNsv9 | 14.2 | 1.4 | -12.7 | 12.7 | 0.900 | 4.56 | 0.65 | 542 | 15.38 |
| 631 | GZcaYHMAWfYq | 35.5 | 3.6 | -31.9 | 31.9 | 0.898 | 2.70 | 0.93 | 676 | 17.95 |
| 256 | 2wOOQjytBs5k | 60.7 | 6.3 | -54.4 | 54.4 | 0.896 | 1.22 | 1.34 | 2035 | 21.17 |
Proponemos otra estrategia de detección de outliers mediante Outlier Scoring. Para ello vamos a utilizar una ponderación de las siguientes medidas:
Sampling Score
- Promedio de tres señales robustas.
- Mide anomalía de muestreo (cortes + irregularidad + eficiencia).
- Severidad de cortes: p95_gap_s alto (en log)
- Intermitencia: gap_tail_ratio = p95_gap_s / median_gap_s alto (en log)
- Eficiencia de muestreo: sampling_efficiency lejos del valor típico (ideal ≈ 1, pero se mide como outlier robusto)
GPS Score
- Mide anomalía espacial compatible con saltos GPS, errores de geolocalización o puntos espurios.
- Severidad robusta del tamaño de paso extremo usando p95_dist_m en log.
- Valores altos indican que el vehículo presenta distancias entre puntos anormalmente grandes en el percentil alto, lo cual suele asociarse a saltos de posición, mala señal o glitches.
Consistency score
- Severidad robusta del error típico.
- Mide la inconsistencia entre velocidad reportada y velocidad implícita calculada.
- Valores altos pueden indicar complejidad para modelar el movimiento (error de distancias, muestreo irregular).
Nota: Usamos el estadístico z_robusto aplicado a las medidas, dicha herramienta es frecuentemente utilizada como método de detección de outliers en textos de EDA.
To detect univariate outliers robustly, we used the modified z‑score based on the median absolute deviation (MAD). Following standard practice in robust statistics (Iglewicz & Hoaglin), observations with |M_i| > 3.5 were considered potential outliers. The constant 0.6745 scales MAD to be consistent with the standard deviation under normality.
def robust_z(x: pd.Series) -> pd.Series:
    """Modified z-score (Iglewicz & Hoaglin): 0.6745 * (x - median) / MAD.

    A tiny epsilon keeps the denominator non-zero when the MAD collapses
    to 0 (constant series). NaNs are ignored by the median estimates.
    """
    values = x.astype(float)
    center = np.nanmedian(values)
    spread = np.nanmedian(np.abs(values - center)) + 1e-9
    return 0.6745 * (values - center) / spread
def safe_log10(s: pd.Series) -> pd.Series:
    """log10 that maps non-positive values to NaN instead of warning/raising."""
    as_float = s.astype(float)
    positive_only = as_float.where(as_float > 0)
    return np.log10(positive_only)
def clip_abs(s: pd.Series, cap: float = 6.0) -> pd.Series:
    """Return |s| capped at *cap* so a single feature cannot dominate a score.

    NaNs propagate unchanged, matching ``Series.clip`` semantics.
    """
    return np.minimum(s.abs(), cap)
# -------------------------
# Ensure required columns
# -------------------------
# NOTE(review): `stats` is the per-vehicle summary table built in an earlier
# cell — this cell only consumes the columns listed below; confirm upstream.
required = ["id","p95_gap_s","gap_tail_ratio","p95_dist_m","sampling_efficiency","n_points","coverage_hours",
"median_speed_kph","implied_median_speed_kph", "delta_median_speed_kph"]
df = stats[required].copy()
# Magnitude of the reported-vs-implied median speed gap (sign is irrelevant).
df["abs_delta_median_speed_kph"] = df["delta_median_speed_kph"].abs()
# -------------------------
# Robust z-scores
# -------------------------
z = pd.DataFrame(index=df.index)
# Sampling irregularity / gaps (log scale tames the heavy right tail)
z["z_log_p95_gap_s"] = robust_z(safe_log10(df["p95_gap_s"]))
z["z_log_gap_tail_ratio"] = robust_z(safe_log10(df["gap_tail_ratio"]))
# Sampling efficiency (no log; we look for extremes relative to the fleet)
z["z_sampling_eff"] = robust_z(df["sampling_efficiency"])
# GPS jumps proxy (log dist)
z["z_log_p95_dist_m"] = robust_z(safe_log10(df["p95_dist_m"]))
# Consistency (error between reported and implied speed)
z["z_abs_delta_speed"] = robust_z(df["abs_delta_median_speed_kph"])
# -------------------------
# Component scores (cap + average)
# -------------------------
CAP = 6.0
# Mean of three capped |z| signals: gap severity, intermittency, efficiency.
sampling_score = (
clip_abs(z["z_log_p95_gap_s"], CAP) +
clip_abs(z["z_log_gap_tail_ratio"], CAP) +
clip_abs(z["z_sampling_eff"], CAP)
) / 3.0
gps_score = clip_abs(z["z_log_p95_dist_m"], CAP)
consistency_score = clip_abs(z["z_abs_delta_speed"], CAP)
# -------------------------
# Reliability weight (avoid over-reacting to vehicles with little data)
# -------------------------
# Tune these thresholds to your case (reasonable for one day of telemetry)
MIN_POINTS_FOR_FULL = 200
MIN_COV_H_FOR_FULL = 4.0
# Log-scaled point-count weight in [0, 1]; lower clip at 2 avoids log10(0/1)=<=0.
w_points = np.clip(np.log10(df["n_points"].clip(lower=2)) / np.log10(MIN_POINTS_FOR_FULL), 0, 1)
w_cov = np.clip(df["coverage_hours"] / MIN_COV_H_FOR_FULL, 0, 1)
reliability_w = (w_points * w_cov).fillna(0)
# -------------------------
# Total outlier score (per-family weights)
# -------------------------
W_SAMPLING = 1.0
W_GPS = 1.0
W_CONS = 1.5 # usually the most "serious" signal for this consistency diagnosis
raw_score = (W_SAMPLING * sampling_score) + (W_GPS * gps_score) + (W_CONS * consistency_score)
outlier_score = raw_score * reliability_w
# -------------------------
# Output table + flags
# -------------------------
# One row per vehicle: component/total scores plus the raw metrics behind them.
out = pd.DataFrame({
"id": df["id"],
"sampling_score": sampling_score,
"gps_score": gps_score,
"consistency_score": consistency_score,
"raw_score": raw_score,
"reliability_w": reliability_w,
"outlier_score": outlier_score,
"sampling_efficiency": df["sampling_efficiency"],
"gap_tail_ratio": df["gap_tail_ratio"],
"p95_gap_s": df["p95_gap_s"],
"p95_dist_m": df["p95_dist_m"],
"abs_delta_median_speed_kph": df["abs_delta_median_speed_kph"],
"n_points": df["n_points"],
"coverage_hours": df["coverage_hours"],
})
# Flags: MAD-z beyond 3.5 (practical rule from Iglewicz & Hoaglin)
TH = 3.5
out["flag_sampling"] = (z["z_log_p95_gap_s"].abs() > TH) | (z["z_log_gap_tail_ratio"].abs() > TH) | (z["z_sampling_eff"].abs() > TH)
out["flag_gps"] = (z["z_log_p95_dist_m"].abs() > TH)
out["flag_consistency"] = (z["z_abs_delta_speed"].abs() > TH)
# Top N outliers
out.sort_values("outlier_score", ascending=False).head(25)
| id | sampling_score | gps_score | consistency_score | raw_score | reliability_w | outlier_score | sampling_efficiency | gap_tail_ratio | p95_gap_s | p95_dist_m | abs_delta_median_speed_kph | n_points | coverage_hours | flag_sampling | flag_gps | flag_consistency | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 181 | 2Nrtauu0kF4Y | 0.466861 | 1.277099 | 6.000000 | 10.743960 | 1.0 | 10.743960 | 0.701068 | 4.448529 | 605.00 | 6200.494278 | 72.287795 | 392 | 21.123333 | False | False | True |
| 688 | jAESbgsgb8vp | 1.503399 | 0.206251 | 6.000000 | 10.709650 | 1.0 | 10.709650 | 0.200353 | 40.000000 | 600.00 | 1626.077933 | 74.032635 | 931 | 19.361667 | False | False | True |
| 618 | 6n6r55ztR50L | 0.289181 | 1.127169 | 6.000000 | 10.416350 | 1.0 | 10.416350 | 0.745604 | 3.703704 | 300.00 | 5415.927354 | 72.533799 | 702 | 21.184167 | False | False | True |
| 376 | 3wsd1LVYPxrr | 0.575109 | 0.664936 | 6.000000 | 10.240045 | 1.0 | 10.240045 | 0.616529 | 1.875000 | 30.00 | 1074.967276 | 73.062759 | 1367 | 9.854444 | False | False | True |
| 372 | 3qfQAgdkd10D | 0.652426 | 0.303406 | 6.000000 | 9.955832 | 1.0 | 9.955832 | 0.356332 | 4.991667 | 89.85 | 1489.598119 | 76.256423 | 1533 | 21.510833 | False | False | True |
| 566 | 5ZSBX8WIm5fg | 0.496187 | 1.031908 | 5.536344 | 9.832612 | 1.0 | 9.832612 | 1.037190 | 2.281369 | 300.00 | 4969.841005 | 65.407326 | 581 | 20.461667 | False | False | True |
| 160 | 2CwyANdzkUrA | 0.614027 | 0.122079 | 6.000000 | 9.736106 | 1.0 | 9.736106 | 0.954160 | 1.848485 | 61.00 | 2186.780331 | 73.540335 | 890 | 8.550278 | False | False | True |
| 403 | 49G3rMQ9HZGo | 0.380000 | 0.627262 | 5.802354 | 9.710793 | 1.0 | 9.710793 | 0.776722 | 4.615385 | 300.00 | 3449.632403 | 68.191449 | 816 | 18.968611 | False | False | True |
| 144 | 26JvjyZhrZHq | 0.358238 | 0.971186 | 5.360266 | 9.369823 | 1.0 | 9.369823 | 0.918966 | 2.510460 | 300.00 | 4704.867883 | 63.564446 | 587 | 21.203333 | False | False | True |
| 200 | 2VQjbvCWyNSQ | 0.253590 | 0.860352 | 5.378720 | 9.182022 | 1.0 | 9.182022 | 0.595578 | 3.485714 | 122.00 | 4257.107601 | 63.757589 | 698 | 11.394167 | False | False | True |
| 665 | Xejt5XDWB68Z | 0.842640 | 0.069291 | 5.454461 | 9.093623 | 1.0 | 9.093623 | 0.284173 | 5.545455 | 61.00 | 1839.977427 | 64.550316 | 1209 | 12.999722 | False | False | True |
| 467 | 4iNWUPtXU3IH | 0.469167 | 0.942165 | 5.108229 | 9.073675 | 1.0 | 9.073675 | 1.000646 | 2.222222 | 300.00 | 4583.262428 | 60.926558 | 436 | 16.339444 | False | False | True |
| 272 | 34P0C6N0jwsv | 0.487800 | 0.092410 | 5.608647 | 8.993181 | 1.0 | 8.993181 | 0.482971 | 3.000000 | 60.00 | 1801.991926 | 66.164067 | 1801 | 20.716667 | False | False | True |
| 143 | 267wFnyIxqAn | 0.997715 | 1.251219 | 4.470444 | 8.954600 | 1.0 | 8.954600 | 0.910156 | 1.052632 | 20.00 | 633.354666 | 54.251332 | 2539 | 14.723056 | False | False | True |
| 56 | 1NodC6MiUG6A | 0.292117 | 1.111426 | 4.974204 | 8.864849 | 1.0 | 8.864849 | 0.776739 | 3.370787 | 300.00 | 5339.538726 | 59.523816 | 490 | 15.595833 | False | False | True |
| 588 | 5lV9qKye5C2I | 0.310671 | 0.677006 | 5.023556 | 8.523011 | 1.0 | 8.523011 | 0.846620 | 2.839623 | 301.00 | 3607.998150 | 60.040344 | 589 | 20.484722 | False | False | True |
| 599 | 5t7JFDNbMKgT | 0.388641 | 0.955166 | 4.725812 | 8.432525 | 1.0 | 8.432525 | 0.955052 | 2.542373 | 300.00 | 4637.344983 | 56.924079 | 327 | 11.222778 | False | False | True |
| 482 | 4q4ctvIRr0ZR | 0.687679 | 0.760989 | 4.598004 | 8.345674 | 1.0 | 8.345674 | 0.470645 | 9.709677 | 301.00 | 3892.035737 | 55.586408 | 1158 | 21.187222 | False | False | True |
| 207 | 2bICz2yS4YQz | 0.552059 | 0.274787 | 4.864748 | 8.123968 | 1.0 | 8.123968 | 0.526253 | 7.228916 | 300.00 | 2509.848149 | 58.378217 | 1095 | 23.986389 | False | False | True |
| 131 | 223uDa5Ke2hV | 0.672898 | 0.802181 | 4.354180 | 8.006350 | 1.0 | 8.006350 | 0.539201 | 1.714286 | 30.00 | 949.757311 | 53.034485 | 2367 | 21.339444 | False | False | True |
| 361 | 3hmcNb4X0gGJ | 0.611792 | 0.440490 | 4.633551 | 8.002609 | 1.0 | 8.002609 | 1.186349 | 2.666667 | 300.00 | 2914.620493 | 55.958453 | 549 | 14.461389 | False | False | True |
| 256 | 2wOOQjytBs5k | 1.088432 | 0.165606 | 4.480428 | 7.974679 | 1.0 | 7.974679 | 1.335022 | 1.220000 | 61.00 | 1686.822249 | 54.355826 | 2035 | 21.171111 | False | False | True |
| 356 | 3eeAlSDySqwp | 1.218776 | 0.285790 | 4.200961 | 7.806008 | 1.0 | 7.806008 | 0.272422 | 26.353333 | 395.30 | 1513.464613 | 51.430856 | 875 | 13.383056 | False | False | True |
| 108 | 1pXAjoLACCV8 | 0.531891 | 0.581032 | 4.373368 | 7.672975 | 1.0 | 7.672975 | 0.520330 | 6.593407 | 300.00 | 3308.692878 | 53.235306 | 821 | 19.942222 | False | False | True |
| 536 | 5KMSlf784G2M | 0.142101 | 0.671994 | 4.559414 | 7.653215 | 1.0 | 7.653215 | 0.667057 | 3.440476 | 144.50 | 3591.714967 | 55.182515 | 760 | 13.292222 | False | False | True |
Cinematics¶
Trajectory¶
# Build one trajectory per vehicle id from the point GeoDataFrame.
# NOTE(review): `gdf` comes from an earlier cell — presumably the raw points
# with a geometry column and parsed timestamps; confirm upstream.
traj_collection = mpd.TrajectoryCollection(gdf, traj_id_col=ID_COL, t=TIME_COL)
# Drop trajectories whose summed path length is implausibly long
# (position jumps inflate the point-to-point distance total).
MAX_LEN_KM = 2_000
filtered = []
for traj in traj_collection:
    if traj.get_length() / 1000 <= MAX_LEN_KM:
        filtered.append(traj)
traj_collection_filtered = mpd.TrajectoryCollection(filtered)
print(f"Kept {len(traj_collection_filtered)} of {len(traj_collection)} trajectories")
Kept 723 of 728 trajectories
Distance histogram¶
# Per-trajectory length in km for the cleaned collection.
traj_lengths_km = pd.Series(
    [traj.get_length() / 1000 for traj in traj_collection_filtered.trajectories],
    name="trajectory_length_km",
)
# Fixed-width bins from 0 up to the next multiple of bin_width above the max.
bin_width = 100
max_len = np.ceil(traj_lengths_km.max() / bin_width) * bin_width
bins = np.arange(0, max_len + bin_width, bin_width)
bin_centers = 0.5 * (bins[1:] + bins[:-1])
# Histogram counts computed ONCE and reused for both the bars and the
# cumulative curve (the original ran np.histogram and ax.hist on the same
# data/bins, doing the binning twice).
counts_raw, _ = np.histogram(traj_lengths_km, bins=bins)
cumulative = counts_raw.cumsum() / counts_raw.sum()
fig, ax = plt.subplots(figsize=(10, 4))
# Cumulative share on the left axis.
ax.plot(bin_centers, cumulative, color="C1", marker="o", ms=3, label="Cumulative share")
ax.set_ylabel("Cumulative share")
ax.set_ylim(0, 1.15)
ax.yaxis.grid(True, linestyle="--", alpha=0.6)
ax.set_title("Trajectory lengths")
# The original set label= but never drew a legend, so the label was dead.
ax.legend(loc="center right")
# Histogram bars on the right axis; same geometry as ax.hist (bars centered
# on the bins, full bin width). Counts axis ticks are hidden — the bars are
# annotated with their counts instead.
ax_count = ax.twinx()
patches = ax_count.bar(
    bin_centers, counts_raw, width=bin_width, color="C0", alpha=0.6, edgecolor="white"
)
ax_count.tick_params(labelright=False, right=False)
ax_count.grid(False)
# Annotate each non-empty bin with its count.
for rect, count in zip(patches, counts_raw):
    if count == 0:
        continue
    x = rect.get_x() + rect.get_width() / 2
    y = rect.get_height()
    ax_count.text(x, y, f"{int(count)}", ha="center", va="bottom", fontsize=8, color="black")
# Annotate each cumulative point (same color as the line).
for x, y in zip(bin_centers, cumulative):
    ax.text(x, y + .02, f"{y:.2f}", ha="center", va="bottom", fontsize=8, color="C1")
# X labels: upper end of each interval, centered on the bar.
ax.set_xlabel("Bin upper end (km)")
ax.set_xticks(bin_centers)
ax.set_xticklabels([f"{edge:.0f}" for edge in bins[1:]], rotation=45, ha="right")
plt.tight_layout()
plt.show()
Long Distance Trajectories Map¶
# Keep only long-haul trips (at least MIN_LEN_KM of travelled distance)
# for the density map below.
MIN_LEN_KM = 50
filtered = []
for traj in traj_collection_filtered:
    if traj.get_length() / 1000 >= MIN_LEN_KM:
        filtered.append(traj)
traj_collection_longdistance = mpd.TrajectoryCollection(filtered)
print(f"Kept {len(traj_collection_longdistance)} of {len(traj_collection_filtered)} trajectories")
Kept 722 of 723 trajectories
# Project to WGS84 so the trajectories line up with the web map tiles.
traj_gdf = traj_collection_longdistance.to_traj_gdf().to_crs(epsg=4326)
plot_opts = dict(
    geo=True,
    tiles="CartoLight",
    line_width=1,
    alpha=0.9,
    datashade=True,      # rasterize + shade by density (hundreds of lines)
    aggregator='count',  # each pixel counts overlapping segments
    cmap='hot',
    hover=False,         # tooltips disabled for speed
    width=900,
    height=600,
)
traj_gdf.hvplot(**plot_opts)