ALM_LLM / tools /ts_preprocess.py
AshenH's picture
Update tools/ts_preprocess.py
500d236 verified
raw
history blame
3.68 kB
# space/tools/ts_preprocess.py
import pandas as pd
import numpy as np
from typing import List
MONTH = "MS" # month-start frequency
def _emi(principal: float, annual_rate: float, n_months: int) -> float:
"""
EMI formula with monthly compounding.
r_m = annual_rate / 12
EMI = P * r_m * (1+r_m)^n / ((1+r_m)^n - 1)
"""
if n_months <= 0 or principal <= 0:
return 0.0
r = annual_rate / 12.0
if r <= 0:
return principal / n_months
fac = (1.0 + r) ** n_months
return principal * r * fac / (fac - 1.0)
def _project_deposit(principal: float, annual_rate: float, months: int) -> pd.DataFrame:
"""
Monthly path for a deposit. Value compounds monthly.
"""
r = annual_rate / 12.0
data = []
bal = principal
for m in range(months + 1):
data.append({"step": m, "portfolio_value": bal})
bal = bal * (1.0 + r)
return pd.DataFrame(data)
def _project_asset(principal: float, annual_rate: float, tenor_months: int) -> pd.DataFrame:
"""
Monthly amortization schedule for an asset/loan using EMI.
"""
emi = _emi(principal, annual_rate, tenor_months)
r = annual_rate / 12.0
data = []
bal = principal
for m in range(tenor_months + 1):
interest = bal * r
principal_pay = max(0.0, emi - interest)
next_bal = max(0.0, bal - principal_pay)
data.append({
"step": m,
"portfolio_value": bal,
"emi": emi,
"interest_component": interest,
"principal_component": principal_pay,
"remaining_balance": next_bal
})
bal = next_bal
return pd.DataFrame(data)
def build_timeseries(df: pd.DataFrame) -> pd.DataFrame:
"""
Input df columns (example):
- portfolio_date (datetime or str)
- instrument_type: 'Deposit' or 'Asset'
- balance: float
- interest_rate: annual rate (e.g., 0.12)
- time_to_maturity: months (int)
- tenor_months: months (for Assets; if missing, fallback to time_to_maturity)
Output:
Long time-series with monthly timestamps, projected 'portfolio_value'
(and EMI breakdown for Assets).
"""
df = df.copy()
if "timestamp" not in df.columns:
df["timestamp"] = pd.to_datetime(df["portfolio_date"])
out_frames: List[pd.DataFrame] = []
for _, row in df.iterrows():
itype = str(row.get("instrument_type", "")).strip().lower()
start = pd.to_datetime(row["timestamp"])
months = int(row.get("time_to_maturity", 0) or 0)
principal = float(row.get("balance", 0.0) or 0.0)
annual_rate = float(row.get("interest_rate", 0.0) or 0.0)
if itype == "deposit":
sched = _project_deposit(principal, annual_rate, months)
elif itype == "asset":
tenor = int(row.get("tenor_months", months) or months or 0)
sched = _project_asset(principal, annual_rate, tenor)
else:
# unknown types: keep flat
sched = pd.DataFrame({"step": range(months + 1), "portfolio_value": principal})
# Build timestamps: month-start frequency
sched["timestamp"] = pd.date_range(start=start, periods=len(sched), freq=MONTH)
# Carry identifiers
for col in ["instrument_type", "interest_rate"]:
if col in df.columns:
sched[col] = row.get(col)
sched["origin_portfolio_date"] = start
sched["origin_balance"] = principal
out_frames.append(sched)
ts = pd.concat(out_frames, ignore_index=True)
ts = ts.sort_values(["timestamp", "instrument_type"]).reset_index(drop=True)
return ts