diff --git a/biome.json b/biome.json index 353ba7be..9b0ea9f9 100644 --- a/biome.json +++ b/biome.json @@ -98,6 +98,9 @@ }, "complexity": { "useLiteralKeys": "off" + }, + "suspicious": { + "noMisplacedAssertion": "off" } } } diff --git a/playground/arrays.html b/playground/arrays.html new file mode 100644 index 00000000..e386a8a6 --- /dev/null +++ b/playground/arrays.html @@ -0,0 +1,336 @@ + + + + + + tsb β€” pd.arrays: Nullable Typed Extension Arrays + + + +
← tsb playground
+
+

🔢 pd.arrays — Nullable Typed Extension Arrays

+

Mirrors pandas.arrays: nullable integers, floats, booleans, strings, datetimes, timedeltas.

+ ✅ Complete + +

Overview

+

+ The pd.arrays namespace provides typed extension arrays with first-class support + for missing values (NA). Each array type stores values and a boolean mask separately — when + mask[i] = true the element at position i is NA (missing). +

+

+ These arrays mirror the pandas nullable array types introduced in pandas 1.0+. They differ from + plain JavaScript arrays in that null / undefined are never stored in + the data buffer — missing values are tracked by a separate mask, enabling efficient aggregate + operations that skip NA values. +

+ +

Quick Start

+
import {
+  arrays,
+  IntegerArray,
+  FloatingArray,
+  BooleanArray,
+  StringArray,
+  DatetimeArray,
+  TimedeltaArray,
+} from "tsb";
+
+// Nullable integer array
+const ints = arrays.IntegerArray.from([1, 2, null, 4, 5], "Int32");
+ints.dtype;              // "Int32"
+ints.toArray();          // [1, 2, null, 4, 5]
+ints.sum();              // 12
+ints.fillna(0).toArray(); // [1, 2, 0, 4, 5]
+
+// Nullable float array
+const floats = arrays.FloatingArray.from([1.5, NaN, 3.0]);
+floats.mean();           // 2.25  (NaN treated as NA)
+
+// Nullable boolean — three-valued logic
+const bools = arrays.BooleanArray.from([true, null, false]);
+bools.any();             // true
+bools.all();             // false
+
+// Nullable string array
+const strs = arrays.StringArray.from(["hello", null, "world"]);
+strs.upper().toArray();  // ["HELLO", null, "WORLD"]
+strs.len().toArray();    // [5, null, 5]
+ +

Array Types

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Classpandas equivalentDtypesNA behaviour
IntegerArraypandas.arrays.IntegerArrayInt8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64null / undefined → NA
FloatingArraypandas.arrays.FloatingArrayFloat32, Float64null, undefined, NaN → NA
BooleanArraypandas.arrays.BooleanArray"boolean"Kleene 3-valued logic
StringArraypandas.arrays.StringArray"string"null / undefined → NA
DatetimeArraypandas.arrays.DatetimeArray"datetime64[ns]"NA preserved through all ops
TimedeltaArraypandas.arrays.TimedeltaArray"timedelta64[ns]"NA preserved through all ops
+ +

IntegerArray

+
import { IntegerArray } from "tsb";
+
+// Construction
+const a = IntegerArray.from([1, 2, null, 4], "Int32");
+a.dtype;          // "Int32"
+a.size;           // 4
+a.at(2);          // null  (NA)
+a.isna();         // [false, false, true, false]
+
+// Arithmetic (NA propagates)
+a.add(10).toArray();    // [11, 12, null, 14]
+a.mul(2).toArray();     // [2, 4, null, 8]
+a.floordiv(2).toArray(); // [0, 1, null, 2]
+
+// Reductions
+a.sum();          // 7
+a.mean();         // 7/3 ≈ 2.33
+a.min();          // 1
+a.max();          // 4
+a.count();        // 3
+
+// Fill and drop NA
+a.fillna(0).toArray();  // [1, 2, 0, 4]
+a.dropna();             // [1, 2, 4]
+
+// Type conversion
+a.astype("Int64");
+ +

FloatingArray

+
import { FloatingArray } from "tsb";
+
+const f = FloatingArray.from([1.0, 2.5, NaN, 4.5]);
+// NaN is treated as NA
+f.toArray();      // [1.0, 2.5, null, 4.5]
+
+// Statistics
+f.sum();          // 8.0
+f.mean();         // 8.0 / 3 ≈ 2.67
+f.std();          // sample standard deviation (ddof=1)
+f.min();          // 1.0
+f.max();          // 4.5
+
+// Arithmetic
+f.add(f).toArray();  // [2.0, 5.0, null, 9.0]
+f.pow(2).toArray();  // [1.0, 6.25, null, 20.25]
+ +

BooleanArray — Three-Valued Logic

+
import { BooleanArray } from "tsb";
+
+const b = BooleanArray.from([true, null, false]);
+b.any();          // true
+b.all();          // false
+b.sum();          // 1  (count of true elements)
+
+// Kleene logic:  false AND NA → false,  true AND NA → NA
+const x = BooleanArray.from([true,  false, null, true ]);
+const y = BooleanArray.from([true,  null,  true, false]);
+x.and(y).toArray(); // [true, false, null, false]
+x.or(y).toArray();  // [true, null,  true, true]   — note: false OR NA = NA
+x.not().toArray();  // [false, true, null, false]
+ +

StringArray

+
import { StringArray } from "tsb";
+
+const s = StringArray.from(["  Hello  ", null, "world"]);
+
+s.strip().toArray();       // ["Hello", null, "world"]
+s.upper().toArray();       // ["  HELLO  ", null, "WORLD"]
+s.lower().toArray();       // ["  hello  ", null, "world"]
+s.replace("o", "0").toArray(); // ["  Hell0  ", null, "w0rld"]
+
+// Pattern matching → BooleanArray
+s.strip().contains("Hello").toArray();   // [true, null, false]
+s.strip().startswith("H").toArray();     // [true, null, false]
+s.strip().endswith("d").toArray();       // [false, null, true]
+
+// Lengths → IntegerArray
+s.strip().len().toArray();   // [5, null, 5]
+
+// Concatenation
+const a = StringArray.from(["foo", "bar"]);
+const b = StringArray.from(["baz", "qux"]);
+a.cat("-", b).toArray(); // ["foo-baz", "bar-qux"]
+ +

DatetimeArray

+
import { DatetimeArray, Timestamp } from "tsb";
+
+const dts = DatetimeArray.from([
+  "2024-01-15T10:30:00Z",
+  null,
+  "2024-06-21T00:00:00Z",
+]);
+dts.dtype;   // "datetime64[ns]"
+dts.year;    // [2024, null, 2024]
+dts.month;   // [1, null, 6]
+dts.day;     // [15, null, 21]
+dts.hour;    // [10, null, 0]
+
+// Min / max
+dts.min(); // Timestamp("2024-01-15T10:30:00Z")
+dts.max(); // Timestamp("2024-06-21T00:00:00Z")
+
+// Fill NA
+const fill = new Timestamp("2000-01-01");
+dts.fillna(fill).toArray();  // no nulls
+
+// Millisecond timestamps
+dts.asMs();  // [number, null, number]
+ +

TimedeltaArray

+
import { TimedeltaArray, Timedelta } from "tsb";
+
+const tds = TimedeltaArray.from([
+  Timedelta.fromComponents({ days: 1 }),
+  null,
+  86_400_000 * 2,         // 2 days in ms
+  "P3DT6H",               // ISO 8601 duration
+]);
+tds.dtype;     // "timedelta64[ns]"
+tds.days;      // [1, null, 2, 3]
+tds.hours;     // [0, null, 0, 6]
+tds.totalSeconds; // [86400, null, 172800, 280800]
+
+// Arithmetic
+const extra = Timedelta.fromComponents({ hours: 12 });
+tds.add(extra).days; // [1, null, 2, 3] (hours += 12)
+tds.mul(2).totalDays; // [2, null, 4, 6.5]
+
+// Reductions
+tds.sum()?.totalDays;  // 6.25  (1 + 2 + 3.25)
+tds.min();             // Timedelta(1 day)
+tds.max();             // Timedelta(3 days 6 hours)
+ +

Shared API (all array types)

+
// Every array type exposes the same base interface:
+
+a.size;               // number of elements (including NA)
+a.dtype;              // dtype string
+a.at(i);              // element at index i, or null (supports negative)
+a.isna();             // boolean[] — true where NA
+a.notna();            // boolean[] — true where not NA
+a.hasNa();            // boolean — true if any NA
+a.toArray();          // (T | null)[] — plain JS array with nulls
+a.dropna();           // T[] — non-NA values only
+a.fillna(value);      // new array with NA replaced by value
+[...a];               // iterable over (T | null) elements
+ +
+ 💡 pandas.array() analogue
+ tsb also exports pdArray(values, dtype) β€” a universal factory that returns a + PandasArray. The typed arrays here provide more specific operations (arithmetic, + string methods, etc.) and should be preferred when the element type is known. +
+ +

Design Notes

+

+ All nullable arrays store a parallel _mask: boolean[] where true + means NA. The data buffer _data: T[] always has a sentinel value at masked + positions (typically 0, false, or "") — these values are never + exposed through the public API. +

+

+ Integer arithmetic truncates toward zero. Float32 values are rounded with + Math.fround. Integer arrays validate bounds on construction. All operations that + return a new array preserve the dtype of the input unless astype() is called. +

+ + diff --git a/playground/bootstrap.html b/playground/bootstrap.html new file mode 100644 index 00000000..9c19d055 --- /dev/null +++ b/playground/bootstrap.html @@ -0,0 +1,191 @@ + + + + + + tsb β€” Bootstrap Confidence Intervals + + + + +

Bootstrap Confidence Intervals

+

+ Non-parametric bootstrap resampling for any statistic — mirrors + scipy.stats.bootstrap. Supports percentile, + basic, and BCa methods. +

+ +

1 — Basic 95% CI for the mean (BCa)

+

+import { bootstrap1 } from "tsb";
+
+const data = [14, 18, 11, 13, 6, 8, 2, 18, 14, 19,
+              11, 14, 1, 3, 12, 9, 8, 15, 16, 7];
+
+function mean(xs) {
+  return xs.reduce((a, b) => a + b, 0) / xs.length;
+}
+
+const result = bootstrap1(data, mean, { n: 5000, seed: 42 });
+
+console.log("CI low :", result.confidenceInterval.low.toFixed(3));
+console.log("CI high:", result.confidenceInterval.high.toFixed(3));
+console.log("SE     :", result.standardError.toFixed(3));
+    
+ +
+ +

2 — Compare percentile, basic, and BCa methods

+

+import { bootstrap1 } from "tsb";
+
+// Skewed data (log-normal like)
+const data = [1, 1, 2, 2, 3, 5, 8, 13, 21, 34, 55];
+
+function mean(xs) {
+  return xs.reduce((a, b) => a + b, 0) / xs.length;
+}
+
+for (const method of ["percentile", "basic", "bca"]) {
+  const r = bootstrap1(data, mean, { n: 4000, seed: 0, method });
+  const { low, high } = r.confidenceInterval;
+  console.log(`${method.padEnd(10)} CI: [${low.toFixed(2)}, ${high.toFixed(2)}]`);
+}
+    
+ +
+ +

3 — CI for median (BCa)

+

+import { bootstrap1 } from "tsb";
+
+const data = [5, 2, 9, 1, 8, 3, 7, 4, 6, 10, 11, 3, 7, 2];
+
+function median(xs) {
+  const s = [...xs].sort((a, b) => a - b);
+  const m = Math.floor(s.length / 2);
+  return s.length % 2 ? s[m] : (s[m - 1] + s[m]) / 2;
+}
+
+const r = bootstrap1(data, median, { n: 4999, seed: 7, method: "bca" });
+console.log("Observed median:", median(data));
+console.log("95% BCa CI     :", r.confidenceInterval.low.toFixed(2), "–", r.confidenceInterval.high.toFixed(2));
+    
+ +
+ +

4 — Two-sample bootstrap (mean difference)

+

+import { bootstrap } from "tsb";
+
+const control   = [4.2, 5.1, 3.8, 4.9, 5.5, 4.4, 4.0, 5.2, 4.7, 4.6];
+const treatment = [6.1, 5.8, 6.4, 5.9, 7.0, 6.2, 5.7, 6.8, 6.3, 6.5];
+
+function meanDiff(a, b) {
+  const avg = xs => xs.reduce((s, x) => s + x, 0) / xs.length;
+  return avg(b) - avg(a);
+}
+
+const r = bootstrap([control, treatment], meanDiff, { n: 5000, seed: 123 });
+console.log("Observed diff:", meanDiff(control, treatment).toFixed(3));
+console.log("95% BCa CI   :", r.confidenceInterval.low.toFixed(3), "–", r.confidenceInterval.high.toFixed(3));
+console.log("SE           :", r.standardError.toFixed(4));
+    
+ +
+ +

5 — Bootstrap distribution histogram

+

+import { bootstrap1 } from "tsb";
+
+const data = Array.from({ length: 30 }, (_, i) => i + 1);
+
+function mean(xs) {
+  return xs.reduce((a, b) => a + b, 0) / xs.length;
+}
+
+const r = bootstrap1(data, mean, { n: 2000, seed: 99, method: "percentile" });
+
+// Bin the distribution into 10 buckets
+const dist = r.bootDistribution;
+const min = Math.min(...dist);
+const max = Math.max(...dist);
+const bins = 10;
+const width = (max - min) / bins;
+const counts = new Array(bins).fill(0);
+for (const v of dist) {
+  const idx = Math.min(bins - 1, Math.floor((v - min) / width));
+  counts[idx]++;
+}
+const barMax = Math.max(...counts);
+for (let i = 0; i < bins; i++) {
+  const lo = (min + i * width).toFixed(1);
+  const bar = "█".repeat(Math.round(counts[i] / barMax * 20));
+  console.log(`${lo.padStart(5)}: ${bar} (${counts[i]})`);
+}
+console.log(`\n95% CI: [${r.confidenceInterval.low.toFixed(2)}, ${r.confidenceInterval.high.toFixed(2)}]`);
+    
+ +
+ +

+ ← back to index +

+ + + + diff --git a/playground/case_when.html b/playground/case_when.html new file mode 100644 index 00000000..46e4fe92 --- /dev/null +++ b/playground/case_when.html @@ -0,0 +1,434 @@ + + + + + + tsb β€” case_when + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

case_when

+

Conditional value selection using CASE WHEN semantics — mirrors pandas.Series.case_when() (pandas 2.2+).

+ +
+

1 — Basic grade classification

+

caseWhen(series, caselist) applies an ordered list of [condition, replacement] pairs. The first matching condition determines the output; if no condition matches the original value is kept.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

2 — Using boolean Series as conditions

+

Conditions can be boolean Series objects (e.g. from comparison operations).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

3 — Using predicate functions

+

Conditions can be predicate functions (value, index) => boolean.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

4 — Series as replacement values

+

Replacements can be Series objects — the matching positional value is used.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

5 — Unmatched rows keep original values

+

Any row not matched by any condition retains its original value — there is no implicit "else" replacement.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

6 — First matching condition wins

+

When multiple conditions match the same row, the first one in caselist takes effect — just like CASE WHEN … THEN … WHEN … THEN … END in SQL.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

7 — Positional index in predicate

+

Predicate functions receive both the value and its positional index as the second argument.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

8 — String Series classification

+

caseWhen works on any Series type — numbers, strings, booleans, or mixed.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ +
+

9 — Comparison with where / mask

+

caseWhen generalises whereSeries to multiple branches. Use whereSeries for a single condition; use caseWhen for multi-branch logic.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + + + + + diff --git a/playground/contingency.html b/playground/contingency.html new file mode 100644 index 00000000..e63094ba --- /dev/null +++ b/playground/contingency.html @@ -0,0 +1,424 @@ + + + + + + Contingency Table Analysis β€” tsb playground + + + + + +

📊 Contingency Table Analysis scipy.stats.contingency

+

+ Association and effect-size measures for contingency tables, mirroring + scipy.stats.contingency. Implemented from scratch with no external + dependencies. +

+ + + +

1 · Expected Frequencies

+

+ Under independence, each cell's expected count is
+ E[i,j] = rowTotal[i] × colTotal[j] / grandTotal. +

+
+ + + +
Press Run to see expected frequencies.
+
+ + +

2 · Relative Risk (Risk Ratio)

+

+ For a 2×2 table [[a, b], [c, d]]:
+ RR = (a / (a+b)) / (c / (c+d)) +

+
+ + + +
Press Run to compute relative risk.
+
+ + +

3 · Odds Ratio

+

+ For a 2×2 table [[a, b], [c, d]]:
+ OR = (a × d) / (b × c) +

+
+ + + +
Press Run to compute odds ratio.
+
+ + +

4 · Association Strength

+

+ Measures of association derived from the chi-square statistic. All return + values in [0, 1], except phi, which may exceed 1 for tables larger than 2×2. +

+
+ + + +
Press Run to compare association measures.
+
+ + +

5 · Real-World Example: Clinical Trial

+

+ A drug trial: 200 patients treated (40 recovered), 200 control (25 recovered). +

+
+ + +
Press Analyse for full summary.
+
+ + + + + + diff --git a/playground/feather.html b/playground/feather.html new file mode 100644 index 00000000..5fa2aeb4 --- /dev/null +++ b/playground/feather.html @@ -0,0 +1,357 @@ + + + + + + tsb β€” readFeather & toFeather + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

🪶 Apache Arrow Feather v2 I/O

+

+ readFeather(data, options?) and toFeather(df, options?) + implement a pure-TypeScript Apache Arrow IPC (Feather v2) reader and writer with no + native dependencies. The format uses FlatBuffers for metadata and stores column data + contiguously with 8-byte alignment. +

+ +
+ Supported column types (read & write): Int8/16/32/64, + UInt8/16/32/64, Float32/64, Bool, + Utf8. + Null / nullable columns fully supported via Arrow validity bitmaps. + Equivalent to pandas.read_feather() / DataFrame.to_feather(). +
+ + +
+

1 · Basic read & write

+

Serialize a DataFrame to an Arrow IPC buffer with + toFeather() and read it back with readFeather(). + The buffer starts and ends with the ARROW1 magic bytes.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Column types — int, float, boolean, string

+

All major column types round-trip correctly. Integers are stored as + Int64, floats as Float64, booleans are bit-packed, and strings use + the Arrow Utf8 layout (int32 offsets + UTF-8 byte data).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · Null values — Arrow validity bitmaps

+

Columns with nulls have a validity bitmap prepended (1 bit per row, LSB-first). + Columns with no nulls omit the bitmap (zero-length validity buffer) to save space.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · usecols — selective column reads

+

Use usecols to read only a subset of columns. + Buffer tracking skips over the buffers for unselected columns, + so no extra allocation is needed.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · indexCol — row index from a column

+

Promote any column to the DataFrame's row index via indexCol. + Use writeIndex: true in toFeather() to persist the + index as __index_level_0__.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · Unicode strings

+

Utf8 columns store UTF-8 byte data addressed by int32 offsets (no per-value length prefix). Any Unicode string — + including emoji, CJK characters, and accented letters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + + + diff --git a/playground/flags.html b/playground/flags.html new file mode 100644 index 00000000..18c8cbf6 --- /dev/null +++ b/playground/flags.html @@ -0,0 +1,300 @@ + + + + + + tsb β€” Flags: metadata for DataFrame and Series + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

Flags: metadata for DataFrame and Series

+

+ Mirrors + pandas.DataFrame.flags — controls duplicate-label behaviour. +

+ + +
+

1 · Default flags

+

+ Every DataFrame and Series exposes a + flags getter returning a Flags object. + By default, allowsDuplicateLabels is true. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Setting flags

+

+ Mutate allowsDuplicateLabels directly on the + Flags object. The change is shared across all + Flags wrappers for the same underlying object. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · DuplicateLabelError

+

+ Setting allowsDuplicateLabels = false on an object with + duplicate index labels immediately throws a + DuplicateLabelError. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · copy() and raiseOnDuplicates()

+

+ Flags.copy() returns a new wrapper sharing the same state. + raiseOnDuplicates() validates only when + allowsDuplicateLabels is false. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + + + + + diff --git a/playground/fwf.html b/playground/fwf.html new file mode 100644 index 00000000..8435429c --- /dev/null +++ b/playground/fwf.html @@ -0,0 +1,504 @@ + + + + + + tsb β€” readFwf + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ“ readFwf β€” Interactive Playground

+

+ Parse fixed-width formatted text into a + DataFrame with readFwf(). + Mirrors pandas + read_fwf() — column boundaries are inferred from whitespace patterns + automatically, or provided explicitly via colspecs / widths.
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 · Auto column-width inference

+

When colspecs is omitted (default "infer"), + readFwf() scans the data rows and identifies separator positions — + character columns where every row contains a space. This mirrors + pandas.read_fwf(colspecs='infer').

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · Explicit colspecs

+

Provide colspecs — an array of [start, end) character + index pairs — for precise control over column boundaries. Useful when separator + positions vary between rows.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Column widths

+

Alternatively, pass widths — an array of integers — to define + consecutive column widths. This produces half-open [0, w0), [w0, w0+w1), … + colspecs internally.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Missing values & dtype forcing

+

Standard NA strings (NA, NaN, null, …) are + recognised automatically. Add custom NA strings with naValues. + Force a column's dtype with the dtype option.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Index column, row limits & skip rows

+

Promote a column to the row index with indexCol. + Limit rows with nRows and skip leading data rows with + skipRows.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · Real-world: Census-style population table

+

Fixed-width format is common in government datasets, legacy mainframe exports, + and statistical software output. Here is a Census-style table.

+
+
+ TypeScript +
+ + +
+
+ + +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

API Reference

+

Parse a fixed-width formatted text string into a DataFrame. + Equivalent to pandas.read_fwf().

+
readFwf(text: string, options?: ReadFwfOptions): DataFrame
+
+type ColSpec = readonly [number, number];   // [start, end) character indices
+
+interface ReadFwfOptions {
+  colspecs?:   readonly ColSpec[] | "infer"; // column boundaries (default: "infer")
+  widths?:     readonly number[];            // column widths (alternative to colspecs)
+  inferNrows?: number;                       // rows to sample for inference (default: 100)
+  header?:     number | null;               // header row index (default: 0)
+  names?:      readonly string[];           // explicit column names
+  indexCol?:   string | number | null;      // column to use as row index
+  dtype?:      Record<string, DtypeName>; // force dtype for named columns
+  naValues?:   readonly string[];           // extra NA string values
+  skipRows?:   number;                      // data rows to skip after header
+  nRows?:      number;                      // maximum data rows to read
+}
+
+ + + + + diff --git a/playground/hdf.html b/playground/hdf.html new file mode 100644 index 00000000..e6a3df08 --- /dev/null +++ b/playground/hdf.html @@ -0,0 +1,400 @@ + + + + + + tsb β€” readHdf & toHdf + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

πŸ—‚οΈ HDF5 I/O

+

+ readHdf(data, options?) and toHdf(df, options?) + implement a pure-TypeScript HDF5 v0 Superblock reader and writer with no + native dependencies. Each file encodes a single DataFrame under a + configurable HDF5 group key (default "df"). The format is compatible + with pandas.read_hdf() / DataFrame.to_hdf(). +

+ +
+ Supported column types: Float64/Float32, + Int8/16/32/64, UInt8/16/32/64, + Bool (stored as UInt8), + String (fixed-length null-padded UTF-8). + Max 120 columns per DataFrame. One DataFrame per file (single HDF5 group key). +
+ + +
+

1 · Basic read & write

+

Serialize a DataFrame to an HDF5 binary buffer with + toHdf() and read it back with readHdf(). + The buffer begins with the standard HDF5 magic bytes + 0x89 HDF\r\n\x1a\n.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

2 · Column types — int, float, boolean, string

+

HDF5 stores numeric types as contiguous fixed-width binary arrays. + Booleans are stored as UInt8 (0 or 1). + Strings are fixed-length null-padded UTF-8 — the element size is the + byte length of the longest string in the column.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

3 · Custom HDF5 group key

+

The HDF5 group key specifies where within the file the DataFrame is stored. + The default is "df". A leading / is stripped + automatically (both in write and read).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

4 · usecols — selective column reads

+

Pass usecols to read only a subset of columns from the file. + Unselected columns are skipped during dataset parsing.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

5 · writeIndex & indexCol — persisting the row index

+

Use writeIndex: true to store the DataFrame's row index as an + extra column named __index__. When reading back, pass + indexCol: "__index__" to restore it as the row index.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

6 · Unicode strings

+

Strings are stored as fixed-length null-padded UTF-8 arrays. The element + size is the byte length of the longest encoded string. Any Unicode string — + including emoji, CJK, and accented characters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

7 · Special float values — NaN, Infinity

+

IEEE 754 special values round-trip correctly since the data is stored + as raw binary float64 without any encoding layer.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + + + diff --git a/playground/holiday.html b/playground/holiday.html new file mode 100644 index 00000000..4d9e3561 --- /dev/null +++ b/playground/holiday.html @@ -0,0 +1,505 @@ + + + + + + tsb β€” Holiday Calendars (pandas.tseries.holiday) + + + +
+ ← Back to playground + +
+

πŸ—“οΈ Holiday Calendars

+

+ New + pandas.tseries.holiday + Holiday calendars, observance rules, and US Federal holidays — all from scratch. +

+
+ + +

1. US Federal Holiday Calendar

+
+

Query year range:

+
+ + + +
+

+    
+ + +

2. Custom Holiday Calendar

+
+

+ Build a calendar from arbitrary holiday rules using the + Holiday class and observance functions. +

+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+ + +

3. Observance Functions

+
+

See how observance functions shift weekend holidays:

+
+
+ + +

4. Floating Holidays with Weekday Offsets

+
+

+ MO(n), TH(n) etc. find the n-th occurrence + of a weekday on/after the base date (or, following dateutil's convention, on/before it when n is negative) — powering "last Monday of May" rules. +

+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+ + +

5. Calendar Registry

+
+
+
+

Code

+ +
+
+

Output

+
(click Run)
+
+
+ +
+
+ + + + diff --git a/playground/hypothesis_tests.html b/playground/hypothesis_tests.html new file mode 100644 index 00000000..ab196997 --- /dev/null +++ b/playground/hypothesis_tests.html @@ -0,0 +1,471 @@ + + + + + + tsb β€” Hypothesis Tests (scipy-style) + + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap +

Hypothesis Tests (scipy-style)

+

+ ← tsb playground · + t-tests, chi-square, ANOVA, normality, correlation, Mann-Whitney U, Kolmogorov-Smirnov +

+ + +
+

1 · One-sample t-test — ttest1samp

+

+ Test whether the mean of a sample equals a hypothesised population mean. + Returns { statistic, pvalue }. Mirrors + scipy.stats.ttest_1samp(a, popmean). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

2 · One-sided t-test — alternative option

+

+ Use alternative: "greater" or "less" for + one-tailed tests. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

3 · Independent t-test — ttestInd (Welch's)

+

+ Compare means of two independent groups. Defaults to Welch's t-test + (unequal variances). Mirrors scipy.stats.ttest_ind. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

4 · Paired t-test — ttestRel

+

+ Compare measurements on the same subjects before and after an + intervention. Mirrors scipy.stats.ttest_rel(a, b). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

5 · Chi-square test for independence — chi2Contingency

+

+ Test whether two categorical variables are independent using a + contingency table. Mirrors scipy.stats.chi2_contingency. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

6 · One-way ANOVA — fOneway

+

+ Test whether two or more groups have equal population means. + F = between-group variance / within-group variance. + Mirrors scipy.stats.f_oneway(*groups). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

7 · Jarque-Bera normality test — jarqueBera

+

+ Test H₀: data is normally distributed, using sample skewness and + kurtosis. JB ~ χ²(2) under H₀. + Mirrors scipy.stats.jarque_bera(data). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

8 · Pearson correlation — pearsonr

+

+ Compute the Pearson product-moment correlation coefficient and its + p-value (H₀: r = 0). Mirrors scipy.stats.pearsonr(x, y). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

9 · Spearman rank correlation — spearmanr

+

+ Non-parametric rank-based correlation. Robust to outliers and + non-normal data. Mirrors scipy.stats.spearmanr(x, y). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

10 · Mann-Whitney U test — mannWhitneyU

+

+ Non-parametric alternative to the independent t-test. Tests whether + one population tends to have larger values than another. + Mirrors scipy.stats.mannwhitneyu. +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + +
+

11 · Kolmogorov-Smirnov test — kstest

+

+ Test whether data follows a specified distribution (e.g. normal, + uniform). D = max|F_n(x) − F(x)|. + Mirrors scipy.stats.kstest(data, cdf). +

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
Ctrl+Enter to run · Tab to indent
+
+
+ + + + + + diff --git a/playground/index.html b/playground/index.html index e5b0bf00..15e6b592 100644 --- a/playground/index.html +++ b/playground/index.html @@ -235,6 +235,11 @@

Wide-to-long reshape. Unpivot columns into variable/value pairs with id_vars, value_vars, var_name, value_name.

✅ Complete
+
+

↕ lreshape

+

Wide-to-long reshape with named column groups. Stack multiple wide columns into long columns with explicit grouping, dropna support.

+
✅ Complete
+

🔄 pivot & pivotTable

Reshape with aggregation. pivot() for unique reshaping; pivotTable() for aggregation (mean/sum/count/min/max/first/last) with fill_value and dropna support.

@@ -330,6 +335,11 @@

Attach arbitrary key→value metadata to any Series or DataFrame via a WeakMap registry. Provides getAttrs, setAttrs, updateAttrs, copyAttrs, withAttrs, mergeAttrs, clearAttrs, getAttr, setAttr, deleteAttr, attrsCount, attrsKeys. Mirrors pandas.DataFrame.attrs / pandas.Series.attrs.

βœ… Complete

+
+

🚩 flags β€” Metadata Flags

+

Metadata flags for DataFrame and Series. The flags getter returns a Flags object with allowsDuplicateLabels property. Setting allowsDuplicateLabels = false on an object with duplicate index labels raises DuplicateLabelError. Mirrors pandas.DataFrame.flags / pandas.core.flags.Flags.

+
βœ… Complete
+

πŸ”€ string_ops β€” Standalone String Ops

Module-level string utilities: strNormalize (Unicode NFC/NFD/NFKC/NFKD), strGetDummies (one-hot DataFrame), strExtractAll (all regex matches), strRemovePrefix, strRemoveSuffix, strTranslate (char-level substitution), strCharWidth (CJK-aware display width), strByteLength. Works on Series, arrays, or scalars.

@@ -501,6 +511,91 @@

βœ… Complete

+
+

πŸ“„ readXml / toXml β€” pd.read_xml() / DataFrame.to_xml()

+

readXml(text, opts?) / toXml(df, opts?) — parse XML into DataFrames and serialize back. rowTag auto-detection, attributes, CDATA, entities, namespaces, usecols, nrows, indexCol. Mirrors pandas.read_xml() / DataFrame.to_xml().

+
βœ… Complete
+
+
+

πŸ“‹ readTable β€” pd.read_table()

+

readTable(text, opts?) — parse delimiter-separated text into a DataFrame. Defaults to tab separator; all ReadCsvOptions forwarded. Mirrors pandas.read_table().

+
βœ… Complete
+
+
+

πŸ—„οΈ SQL I/O β€” pd.read_sql() / DataFrame.to_sql()

+

readSql / readSqlQuery / readSqlTable / toSql — adapter-based SQL I/O. Bring your own DB driver; zero runtime dependencies. Mirrors pandas.read_sql(), read_sql_query(), read_sql_table(), DataFrame.to_sql().

+
βœ… Complete
+
+
+

πŸ“Š readStata & toStata β€” pd.read_stata() / DataFrame.to_stata()

+

readStata / toStata — Stata DTA binary file I/O. Supports reading v114/115 (old binary) and v117/118/119 (new XML-tagged) formats; writes v118. Missing values, string columns, value labels (convertCategoricals). Mirrors pandas.read_stata(), DataFrame.to_stata().

+
βœ… Complete
+
+
+

πŸ“¦ readParquet & toParquet β€” pd.read_parquet() / DataFrame.to_parquet()

+

readParquet / toParquet — Apache Parquet binary file I/O. Pure-TypeScript Thrift compact protocol, PLAIN encoding, INT32/INT64/DOUBLE/BOOLEAN/BYTE_ARRAY types, optional columns, usecols/nRows/indexCol/writeIndex. Mirrors pandas.read_parquet(), DataFrame.to_parquet().

+
βœ… Complete
+
+
+

πŸ“ readFwf β€” pd.read_fwf()

+

readFwf(text, opts?) — read fixed-width formatted text into a DataFrame. Auto-infers column boundaries from whitespace patterns; supports explicit colspecs / widths, header, names, indexCol, NA handling, dtype forcing, skipRows, nRows. Mirrors pandas.read_fwf().

+
βœ… Complete
+
+
+

πŸ”€ case_when β€” pd.Series.case_when()

+

caseWhen(series, caselist) — conditional value selection using ordered CASE WHEN semantics. Mirrors pandas.Series.case_when() (pandas 2.2+).

+
βœ… Complete
+
+
+

πŸ—‚οΈ readHdf & toHdf β€” pd.read_hdf() / DataFrame.to_hdf()

+

readHdf / toHdf — HDF5 v0 Superblock binary file I/O. Pure-TypeScript, no native deps. Float64/32, Int/UInt 8–64, Bool, fixed-length UTF-8 strings. usecols, indexCol, writeIndex, custom key. Mirrors pandas.read_hdf(), DataFrame.to_hdf().

+
βœ… Complete
+
+
+

πŸ”’ pd.arrays β€” Nullable Typed Extension Arrays

+

Nullable typed arrays: IntegerArray, FloatingArray, BooleanArray, StringArray, DatetimeArray, TimedeltaArray. Three-valued logic, NA masking, element-wise arithmetic, string ops. Mirrors pandas.arrays.

+
βœ… Complete
+
+
+

πŸ—“οΈ Holiday Calendars β€” pd.tseries.holiday

+

Holiday calendar system: Holiday rules (fixed & floating), AbstractHolidayCalendar, USFederalHolidayCalendar (11 US federal holidays), observance helpers (nearestWorkday, sundayToMonday, …), and weekday offsets (MO, TH, …). Mirrors pandas.tseries.holiday.

+
βœ… Complete
+
+
+

πŸ•³οΈ SparseArray & SparseDtype β€” pd.arrays.SparseArray

+

Memory-efficient sparse storage for arrays with many repeated (fill) values. SparseArray stores only non-fill values and their positions. Properties: sp_values, sp_index, density, npoints. Aggregations: sum, mean, max, min, std. Mirrors pandas.arrays.SparseArray and pandas.SparseDtype.

+
βœ… Complete
+
+
+

πŸ”¬ Hypothesis Tests β€” scipy.stats t-tests, chiΒ², ANOVA, KS

+

scipy-style statistical hypothesis tests implemented from scratch: ttest1samp, ttestInd (Welch's), ttestRel (paired), chi2Contingency, fOneway (ANOVA), jarqueBera (normality), pearsonr, spearmanr, mannWhitneyU, kstest. Returns { statistic, pvalue }.

+
βœ… Complete
+
+
+

πŸ“ Regression β€” linregress, polyfit, OLS

+

Linear and polynomial regression from scratch: linregress (simple OLS with slope, r, p, stderr), polyfit / polyval (polynomial least squares), and OLS class (multiple regression with R², F-test, AIC, BIC, predict(), summary()). Mirrors scipy.stats.linregress, numpy.polyfit, and statsmodels.OLS.

+
βœ… Complete
+
+
+

πŸ“Š Contingency Tables β€” expectedFreq, relativeRisk, oddsRatio, association

+

Association and effect-size measures for contingency tables: expectedFreq (expected cell counts under independence), relativeRisk (risk ratio with log-normal CI), oddsRatio (Woolf CI), and association (Cramér's V, phi, Pearson's C, Tschuprow's T). Mirrors scipy.stats.contingency.

+
βœ… Complete
+
+
+

πŸ”­ Multivariate Analysis β€” mahalanobis, PCA

+

Multivariate statistical analysis: mahalanobis distance (Σ⁻¹-weighted Euclidean, mirrors scipy.spatial.distance.mahalanobis), PCA class (eigendecomposition of the covariance matrix, mirrors sklearn.decomposition.PCA), plus covMatrix and invertMatrix helpers.

+
βœ… Complete
+
+
+

🎲 Bootstrap β€” non-parametric confidence intervals

+

Non-parametric bootstrap confidence intervals for any statistic: bootstrap (one or two samples, mirrors scipy.stats.bootstrap), bootstrap1 (single-sample convenience). Methods: percentile, basic (pivoting), and BCa (bias-corrected accelerated, default). Seeded RNG for reproducibility.

+
βœ… Complete
+
+
+

πŸ“Š Kernel Density Estimation (KDE)

+

Non-parametric density estimation using Gaussian kernels: gaussianKDE (mirrors scipy.stats.gaussian_kde). Bandwidth methods: Silverman (default), Scott, or custom factor. API: pdf, evaluate, logPdf, integrate, cdf, resample, integrateGaussian, weighted KDE.

+
βœ… Complete
+
diff --git a/playground/kde.html b/playground/kde.html new file mode 100644 index 00000000..351f8786 --- /dev/null +++ b/playground/kde.html @@ -0,0 +1,470 @@ + + + + + + tsb Β· Kernel Density Estimation (KDE) + + + +

πŸ“Š Kernel Density Estimation

+

+ Non-parametric density estimation using Gaussian kernels. + Mirrors scipy.stats.gaussian_kde. +

+ +
+

Interactive KDE Explorer

+ + + + + + + + + + + + + +
+
+ +
+

Quick-start code

+

+    
+ +
+

API reference

+

+ gaussianKDE(data, options?) — factory; returns a + GaussianKDE instance. +

+

+ Bandwidth methods: + silverman (default)   + scott   + number (factor Γ— Οƒ) +

+

Key methods on GaussianKDE:

+
    +
  • pdf(x) — density at a single point
  • +
  • evaluate(points) — density at an array of points
  • +
  • logPdf(x) / logpdf(points) — log-density
  • +
  • integrate(low, high) — probability mass in interval
  • +
  • cdf(x) — cumulative probability up to x
  • +
  • integrateFull() — total mass (≈ 1)
  • +
  • integrateGaussian(other) — analytic cross-integral
  • +
  • resample(size, seed?) — draw samples
  • +
  • factor — bandwidth h (kernel σ)
  • +
  • covariance — h² (kernel variance)
  • +
  • neff — effective sample size
  • +
+
+ + + + + + diff --git a/playground/lreshape.html b/playground/lreshape.html new file mode 100644 index 00000000..3f434a11 --- /dev/null +++ b/playground/lreshape.html @@ -0,0 +1,327 @@ + + + + + + tsb β€” lreshape + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

↕ lreshape β€” Interactive Playground

+

Reshape wide-format data to long format using named column groups — + mirrors pandas.lreshape().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 Β· Basic lreshape

+

Stack two wide columns (v1, v2) into a single long + column v, repeating the id column for each block.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 Β· Multiple groups

+

Reshape with multiple output columns simultaneously. Each output column is + fed from a separate list of input columns.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 Β· dropna option

+

By default rows where any value column is null/NaN + are dropped. Pass dropna: false to keep them.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 Β· Real-world: survey scores

+

Stack multiple rounds of survey scores into a long-format table.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

API Reference

+

Reshape wide-format data to long format by explicitly naming which input + columns map to each output column.

+
lreshape(
+  data: DataFrame,
+  groups: Record<string, string[]>,  // { outputCol: [inputCol1, inputCol2, ...] }
+  options?: {
+    dropna?: boolean,  // drop rows with null/NaN values (default: true)
+  }
+): DataFrame
+

All input columns not mentioned in groups + become identity (id) columns and are repeated for each block. All group lists must + have the same length k; the result has nRows × k rows + (before applying dropna).

+
+ + + + + diff --git a/playground/multivariate.html b/playground/multivariate.html new file mode 100644 index 00000000..3832d16a --- /dev/null +++ b/playground/multivariate.html @@ -0,0 +1,476 @@ + + + + + + Multivariate Analysis β€” tsb playground + + + +

Multivariate Analysis tsb

+

+ Multivariate statistics: Mahalanobis distance and + Principal Component Analysis (PCA) — mirroring + scipy.spatial.distance.mahalanobis and + sklearn.decomposition.PCA. +

+ + +

Mahalanobis Distance

+

+ Measures distance between two points accounting for correlations in the data. + When the inverse covariance (VI) is the identity matrix, it reduces to Euclidean distance. +

+
d = √( (u−v)ᵀ · VI · (u−v) ) where VI = Σ⁻¹
+ +
+ + + + + + + + + +
Press "Compute" to see results.
+
+ +
+ + + + + +
Press to see results.
+
+ + +

Principal Component Analysis (PCA)

+

+ Reduces dimensionality by projecting data onto the directions of maximum variance. + Each row of the input matrix X is one observation; each column is a feature. +

+ +
+ + + + + +
Press "Run PCA" to see results.
+
+ +
+ + + + + +
Press to see results.
+
+ +
+ + + +
Press to see results.
+
+ + + + + + diff --git a/playground/parquet.html b/playground/parquet.html new file mode 100644 index 00000000..31f1b09b --- /dev/null +++ b/playground/parquet.html @@ -0,0 +1,361 @@ + + + + + + tsb β€” readParquet & toParquet + + + +
+
+
Initializing playground…
+
+ + ← Back to roadmap + +

πŸ“¦ Apache Parquet I/O

+

+ readParquet(data, options?) and toParquet(df, options?) + implement a pure-TypeScript Apache Parquet reader and writer with no native dependencies. + The implementation uses the Thrift compact protocol for metadata and PLAIN encoding for + column data pages. +

+ +
+ Supported physical types: INT32, INT64, + DOUBLE, BOOLEAN, BYTE_ARRAY (UTF-8 strings). + Compression: UNCOMPRESSED. Flat tables only (no nested or repeated fields). + Equivalent to pandas.read_parquet() / DataFrame.to_parquet(). +
+ + +
+

1 Β· Basic read & write

+

Serialize a DataFrame to a binary Parquet buffer with + toParquet() and read it back with readParquet(). + The buffer starts and ends with the PAR1 magic bytes.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 Β· Column types β€” int, float, boolean, string

+

All major column types round-trip correctly. Integers use INT32 or INT64, + floats use DOUBLE, booleans are bit-packed (1 byte per 8 values), + and strings are BYTE_ARRAY (UTF-8).

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 Β· usecols & nRows β€” selective reads

+

Use usecols to read a subset of columns and nRows + to limit the number of rows. Both options reduce memory usage and speed up parsing.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 Β· indexCol β€” row index from a column

+

Promote any column to the DataFrame's row index by passing indexCol + to readParquet(). Use writeIndex: true in toParquet() + to persist the index as __index_level_0__.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 Β· Unicode strings

+

BYTE_ARRAY columns are length-prefixed UTF-8. Any Unicode string — including + emoji, CJK characters, and accented letters — round-trips exactly.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 Β· Many columns β€” stress test

+

Each column is stored as a separate column chunk in the row group. + There is no limit on column count.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + + + + + + diff --git a/playground/read_table.html b/playground/read_table.html new file mode 100644 index 00000000..550913b8 --- /dev/null +++ b/playground/read_table.html @@ -0,0 +1,367 @@ + + + + + + tsb β€” readTable + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ“‹ readTable β€” Interactive Playground

+

+ Parse delimiter-separated text into a DataFrame + with readTable(). Mirrors + pandas + read_table() — identical to readCsv() but defaults + to a tab (\t) separator.
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 Β· Basic tab-separated file

+

By default readTable() splits on tabs, infers column dtypes, + and returns a DataFrame.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 Β· Custom separator

+

Pass sep to use any delimiter — pipe, semicolon, or + multi-character strings.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 Β· Handling missing values

+

readTable() recognises common NA strings (NA, + N/A, null, …) and converts them to + NaN. Extend the list with naValues.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 Β· Index column, row limits & skip rows

+

Use indexCol to promote a column to the row index. + nRows caps the number of data rows read; skipRows + skips rows after the header.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

API Reference

+

Parse a delimiter-separated text string into a DataFrame. + Defaults to tab (\t) unlike readCsv which uses + a comma.

+
readTable(text: string, options?: ReadTableOptions): DataFrame
+
+interface ReadTableOptions {
+  sep?:      string;                     // separator (default: "\t")
+  header?:   number | null;              // header row index (default: 0)
+  indexCol?: string | number | null;     // column to use as row index
+  dtype?:    Record<string, DtypeName>; // force dtype for named columns
+  naValues?: readonly string[];          // extra NA string values
+  skipRows?: number;                     // data rows to skip after header
+  nRows?:    number;                     // maximum data rows to read
+}
+
+ + + + + diff --git a/playground/regression.html b/playground/regression.html new file mode 100644 index 00000000..5e96706b --- /dev/null +++ b/playground/regression.html @@ -0,0 +1,236 @@ + + + + + + tsb β€” Regression (linregress, polyfit, OLS) + + + + +

πŸ“ Regression β€” linregress, polyfit, polyval, OLS

+

+ Linear and polynomial regression from scratch — mirrors + scipy.stats.linregress, numpy.polyfit, + and statsmodels.OLS. + ← back to index +

+ + +

1. Simple Linear Regression — linregress(x, y)

+

+ Fits y = slope·x + intercept by OLS and returns the slope, intercept, + Pearson r, two-tailed p-value, and standard errors. +

+
import { linregress } from "tsb";
+
+const x = [1, 2, 3, 4, 5];
+const y = [2, 4, 5, 4, 5];
+
+const r = linregress(x, y);
+console.log("slope    :", r.slope.toFixed(4));
+console.log("intercept:", r.intercept.toFixed(4));
+console.log("r        :", r.rvalue.toFixed(4));
+console.log("p-value  :", r.pvalue.toFixed(4));
+console.log("stderr   :", r.stderr.toFixed(4));
+
+ +
Output will appear here…
+ + +

2. Polynomial Fitting — polyfit(x, y, deg) and polyval(coefs, x)

+

+ Fit a polynomial of any degree to data and evaluate it at new points. +

+
import { polyfit, polyval } from "tsb";
+
+const x = [0, 1, 2, 3, 4, 5];
+const y = x.map(v => v * v);          // y = x²
+
+const coefs = polyfit(x, y, 2);       // degree-2 fit
+console.log("Coefficients (highest first):", coefs.map(c => c.toFixed(4)));
+
+// Evaluate at new points
+const xNew = [6, 7, 8];
+const yNew = polyval(coefs, xNew);
+console.log("Predicted at x=6,7,8:", yNew.map(v => v.toFixed(1)));
+
+ +
Output will appear here…
+ + +

3. Multiple OLS Regression — new OLS().fit(X, y)

+

+ Fit a multiple linear regression model: y = β₁x₁ + β₂x₂ + β₀. + Returns coefficients, standard errors, t-statistics, p-values, R², + and an F-test. +

+
import { OLS } from "tsb";
+
+// y = 2·x₁ + 3·x₂ + 1  (exact)
+const X = [
+  [1, 0], [2, 1], [3, 2], [4, 3],
+  [5, 4], [6, 5], [7, 6], [8, 7],
+];
+const y = X.map(([a, b]) => 2 * a + 3 * b + 1);
+
+const model = new OLS();
+const result = model.fit(X, y);
+
+console.log("params:", result.params.map(v => v.toFixed(4)));
+console.log("R²    :", result.rsquared.toFixed(6));
+console.log("F-stat:", result.fvalue.toFixed(4));
+console.log();
+console.log(result.summary());
+
+ +
Output will appear here…
+ + +

4. Prediction — result.predict(newX)

+

+ Use the fitted model to predict responses for new predictor values. +

+
import { OLS, linregress } from "tsb";
+
+// Fit simple model
+const X = [[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]];
+const y = X.map(([xi]) => 1.5 * xi + 0.5 + (Math.random() - 0.5) * 0.5);
+
+const result = new OLS().fit(X, y);
+console.log("slope (approx 1.5)    :", result.params[0].toFixed(3));
+console.log("intercept (approx 0.5):", result.params[1].toFixed(3));
+console.log("R²                    :", result.rsquared.toFixed(4));
+
+// Predict at x = 11, 12, 13
+const preds = result.predict([[11],[12],[13]]);
+console.log("Predictions at x=11,12,13:", preds.map(v => v.toFixed(2)));
+
+ +
Output will appear here…
+ + + + diff --git a/playground/sas.html b/playground/sas.html new file mode 100644 index 00000000..760d3196 --- /dev/null +++ b/playground/sas.html @@ -0,0 +1,91 @@ + + + + + + tsb β€” readSas (SAS XPORT reader) + + + + + +

readSas β€” SAS XPORT reader

+

+ readSas(data) reads a SAS XPORT v5 (.xpt) file and returns a + DataFrame. SAS XPORT is a portable format widely used by the US FDA and CDC for + data submissions. +

+ +

Supported features

+
    +
  • SAS XPORT Version 5 (.xpt files)
  • +
  • Numeric variables (IBM 370 hex double-precision floating point)
  • +
  • Character variables (fixed-width ASCII strings)
  • +
  • Missing numeric values → null
  • +
  • Optional index column via options.index
  • +
+ +

Basic usage

+
import { readSas } from "tsb";
+import { readFileSync } from "node:fs";
+
+// Load from disk
+const buf = new Uint8Array(readFileSync("data.xpt").buffer);
+const df = readSas(buf);
+df.head();
+
+// With index column
+const df2 = readSas(buf, { index: "SUBJID" });
+
+ +

Options

+ + + + + + + + + + + + + + + + + +
OptionTypeDefaultDescription
indexstring | nullnullColumn to use as the DataFrame index. null = default integer index.
+ +

IBM 370 floating-point

+

+ SAS XPORT stores numeric values as IBM System/370 hexadecimal double-precision floating-point + numbers. This is different from IEEE 754 (which JavaScript and most modern systems + use). readSas automatically converts IBM 370 doubles to IEEE 754. +

+
// IBM 370 double format:
+// Byte 0: [sign (1 bit)][exponent (7 bits, excess-64, base-16)]
+// Bytes 1–7: [56-bit mantissa (hexadecimal fraction)]
+// value = (-1)^sign × 16^(exp−64) × mantissa / 2^56
+
+ +

Missing values

+

+ SAS encodes missing numeric values using a special first-byte: 0x2e + ('.') for the standard missing value, and 0x41–0x5A + (A–Z) for special missings. readSas maps all of these to + null. +

+ +

Related

+ + + diff --git a/playground/sparse.html b/playground/sparse.html new file mode 100644 index 00000000..3de58b1b --- /dev/null +++ b/playground/sparse.html @@ -0,0 +1,448 @@ + + + + + + tsb β€” SparseArray & SparseDtype + + + +
← tsb playground
+
+

πŸ•³οΈ SparseArray & SparseDtype

+

Memory-efficient storage for arrays where most values share a common fill value. Mirrors pandas.arrays.SparseArray and pandas.SparseDtype.

+ βœ… Complete + +

Overview

+

+ A SparseArray stores only the non-fill values and their positions. + When most elements share a common value — zeros in a sparse matrix, NaN in sensor data with + many gaps, or false in a boolean feature array — sparse storage dramatically reduces memory use. +

+

+ The fill_value is the implicit value for all positions not explicitly stored. + Common choices are 0 (numeric zero), NaN (missing values), or + false (boolean). By default tsb uses NaN (matching pandas behaviour). +

+ +
+ πŸ’‘ When to use SparseArray: when density < ~0.25 (fewer than 25% of values + are non-fill). Below that threshold, sparse storage saves memory and the bookkeeping overhead + is worth it. +
+ +

Quick Start

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+ +

Interactive Demo

+

Enter a comma-separated list of numbers and choose a fill value to see how SparseArray stores your data.

+ + + + + + + +
+ +

API Reference

+ +

SparseArray.fromDense(data, fill_value?, subtype?)

+

Create a SparseArray from a dense array. Values equal to fill_value are not stored.

+ +

SparseArray.fromSparse(length, indices, values, fill_value?, subtype?)

+

Create a SparseArray directly from COO (Coordinate) sparse components.

+ +

Properties

+ + + + + + + + + +
PropertyTypeDescription
lengthnumberTotal logical length (including fill positions)
npointsnumberNumber of explicitly stored (non-fill) values
densitynumberFraction stored: npoints / length (0–1)
fill_valuenumberImplicit value for positions not stored
sp_valuesnumber[]Array of stored (non-fill) values
sp_indexnumber[]Positions (0-based) of stored values
dtypeSparseDtypeDescribes element type and fill value
+ +

Methods

+ + + + + + + + + + + + + + + +
MethodDescription
at(i)Value at index i (fill_value for fill positions)
toDense()Convert to a regular number[] array
toCoo()Return {indices, values} COO representation
fillna(value)Replace NaN values; returns new SparseArray
withFillValue(v)Change fill value; returns new SparseArray
slice(start, end?)Slice to [start, end); returns new SparseArray
add(scalar)Add a scalar to all values; returns new SparseArray
mul(scalar)Multiply by a scalar; returns new SparseArray
sum()Sum of all values (NaN-skipped)
mean()Mean of all non-NaN values
max()Maximum value (NaN-ignored)
min()Minimum value (NaN-ignored)
std(ddof?)Standard deviation (default ddof=1)
+ +

Use Cases

+ +

Sensor data with gaps

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+ +

Feature matrix (recommendation systems)

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+ +

Sparse boolean flags

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+ + + + + diff --git a/playground/sql.html b/playground/sql.html new file mode 100644 index 00000000..8c28d1f6 --- /dev/null +++ b/playground/sql.html @@ -0,0 +1,476 @@ + + + + + + tsb β€” SQL I/O + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ—ƒοΈ SQL I/O β€” Interactive Playground

+

+ readSql, readSqlQuery, readSqlTable, and toSql + mirror pandas + read_sql() and + DataFrame.to_sql(). + Because tsb has zero runtime dependencies, you pass + a SqlConnection adapter for your database driver. + Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 Β· readSqlQuery β€” run a SELECT statement

+

Pass a SQL string and a SqlConnection adapter. The result is a + DataFrame. An optional indexCol promotes a column to the row + index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

2 Β· readSqlTable β€” load an entire table

+

Pass a table name (not a SQL string). Use columns to select a subset, + or indexCol to set the row index.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

3 Β· readSql β€” auto-detect query vs table name

+

readSql inspects the first argument: if it looks like a SQL statement + it calls readSqlQuery; otherwise it calls readSqlTable.

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

4 Β· toSql β€” write a DataFrame to a SQL table

+

Writes rows from a DataFrame into the database. Returns the number of + rows written. The ifExists option controls what happens when the table + already exists: "fail", "replace", or + "append".

+
+
+ TypeScript +
+ + +
+
+ + +
Click β–Ά Run to execute
+
Ctrl+Enter to run Β· Tab to indent
+
+
+ + +
+

API Reference

+

All four functions accept a SqlConnection adapter — implement + query() plus optional listTables() and insert() + for your database driver.

+
interface SqlConnection {
+  query(sql: string, params?: readonly SqlValue[]): SqlResult;
+  listTables?(): string[];
+  insert?(table: string, rows: object[], columns: string[], ifExists: IfExistsOption): number;
+}
+
+readSqlQuery(sql: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSqlTable(table: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+readSql(sqlOrTable: string, con: SqlConnection, options?: ReadSqlOptions): DataFrame
+toSql(df: DataFrame, name: string, con: SqlConnection, options?: ToSqlOptions): number
+
+interface ReadSqlOptions {
+  indexCol?: string | string[];
+  columns?:  string[];
+  params?:   readonly SqlValue[];
+  parseDates?: string[];
+}
+
+interface ToSqlOptions {
+  ifExists?: "fail" | "replace" | "append";  // default: "fail"
+  index?:    boolean;                          // include index column (default: true)
+  chunkSize?: number;
+}
+
+ + + + + diff --git a/playground/stata.html b/playground/stata.html new file mode 100644 index 00000000..18743f45 --- /dev/null +++ b/playground/stata.html @@ -0,0 +1,379 @@ + + + + + + tsb β€” readStata & toStata + + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ“Š readStata & toStata β€” Interactive Playground

+

Read and write Stata DTA files from TypeScript. + toStata(df) serializes a DataFrame to a Stata DTA v118 binary buffer. + readStata(buf, options) parses the buffer back into a DataFrame. + Numeric missing values are represented as null. Mirrors + pandas.read_stata() and DataFrame.to_stata().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 Β· Basic round-trip β€” write and read back

+

Create a DataFrame, serialize it to a Stata DTA v118 binary buffer with + toStata(), then parse it back with readStata(). + All columns, values, and shape are preserved.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 Β· Missing values β€” null round-trip

+

Stata represents missing numeric values as special sentinel bit patterns. + readStata maps all missing sentinels to null. + toStata writes the standard Stata system-missing value for each type.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 Β· Options β€” dataLabel & variableLabels

+

Embed a dataset description with dataLabel and per-column annotations + with variableLabels. These metadata fields are stored in the DTA header + and are visible in Stata's describe command.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 Β· Options β€” usecols, nRows, indexCol

+

Restrict columns with usecols, limit rows with nRows, + and promote a column to the DataFrame index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 Β· Boolean columns

+

Boolean values are stored as Stata byte (int8) with + true → 1 and false → 0. Reading converts + them back to numbers; use .map() or comparison operators + to recover booleans if needed.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 Β· writeIndex β€” include the row index

+

Pass writeIndex: true to include the DataFrame's row index + as an extra _index column in the DTA file.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + + + diff --git a/playground/xml.html b/playground/xml.html new file mode 100644 index 00000000..3d70057a --- /dev/null +++ b/playground/xml.html @@ -0,0 +1,463 @@ + + + + + + tsb β€” readXml & toXml + + + +
+
+
Initializing playground…
+
+ ← Back to roadmap +

πŸ“„ readXml & toXml β€” Interactive Playground

+

Parse XML text into a DataFrame with + auto-detection of row elements, attribute and child-element columns, entity decoding, + CDATA support, namespace stripping, and numeric coercion. Serialize any DataFrame + back to well-formed XML with full formatting control. Mirrors + pandas.read_xml() and pandas.DataFrame.to_xml().
+ Edit any code block below and press ▶ Run + (or Ctrl+Enter) to execute it live in your browser. +

+ + +
+

1 Β· Basic readXml β€” child-element rows

+

The most common XML layout: a root element containing repeating row elements, + each with child elements as columns. readXml auto-detects the row + tag and coerces numeric strings automatically.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

2 Β· Attribute rows

+

XML elements can carry data as attributes instead of (or in addition to) child + elements. Use attribs: true (the default) to include them.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

3 Β· usecols, nrows, indexCol

+

Restrict the columns returned with usecols, limit rows with + nrows, and promote a column to the index with indexCol.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

4 Β· naValues β€” custom NA strings

+

Built-in NA strings include "", "NA", "NaN", + "N/A", "null", "None", "nan". + Use naValues to add your own.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

5 Β· Entities & CDATA

+

Named entities (&amp;, &lt;, …), decimal/hex + character references (&#65;, &#x41;), and + CDATA sections (<![CDATA[…]]>) are all handled transparently.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

6 Β· toXml β€” child elements (default)

+

toXml(df) produces a well-formed XML document with an XML declaration, + a configurable root element, and one child element per row containing one sub-element + per column.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

7 Β· toXml β€” attribs mode

+

Set attribs: true to emit column values as XML attributes on each + row element instead of as child elements — produces more compact output.

+
+
+ TypeScript +
+ + +
+
+ +
Click β–Ά Run to execute
+
+
+ + +
+

8 Β· toXml β€” namespaces & CDATA columns

+

Declare XML namespace prefixes on the root element with namespaces. + Wrap sensitive columns in CDATA sections with cdataCols to preserve + special characters literally.

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + +
+

9 · Round-trip: toXml → readXml

+

Serializing a DataFrame to XML and reading it back should produce an identical + DataFrame (shape and values).

+
+
+ TypeScript +
+ + +
+
+ +
Click ▶ Run to execute
+
+
+ + + + + + diff --git a/src/core/arrays/boolean_array.ts b/src/core/arrays/boolean_array.ts new file mode 100644 index 00000000..bee245b8 --- /dev/null +++ b/src/core/arrays/boolean_array.ts @@ -0,0 +1,224 @@ +/** + * BooleanArray β€” nullable boolean extension array. + * + * Mirrors `pandas.arrays.BooleanArray`. Stores boolean values with a separate + * mask for missing (NA) values, enabling three-valued logic (True / False / NA). + * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * const a = arrays.BooleanArray.from([true, null, false]); + * a.dtype; // "boolean" + * a.at(1); // null + * a.any(); // true + * a.all(); // false + * a.fillna(false).toArray(); // [true, false, false] + * ``` + * + * @module + */ + +import { MaskedArray } from "./masked_array.ts"; + +// ─── BooleanArray ───────────────────────────────────────────────────────────── + +/** + * A nullable boolean array. + * + * Use {@link BooleanArray.from} to create instances. + */ +export class BooleanArray extends MaskedArray { + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link BooleanArray} from a sequence of boolean (or null/undefined). 
+ * + * @example + * ```ts + * BooleanArray.from([true, false, null, true]); + * ``` + */ + static from(values: Iterable): BooleanArray { + const data: boolean[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(false); + mask.push(true); + } else { + data.push(Boolean(v)); + mask.push(false); + } + } + return new BooleanArray(data, mask); + } + + /** @internal */ + static _fromRaw(data: boolean[], mask: boolean[]): BooleanArray { + return new BooleanArray(data, mask); + } + + // ─── Dtype ──────────────────────────────────────────────────────────────── + + get dtype(): "boolean" { + return "boolean"; + } + + // ─── Reductions ─────────────────────────────────────────────────────────── + + /** + * Return `true` if any non-NA element is `true`. + * Returns `null` if all elements are NA and `skipna` is `false`. + */ + any(skipna = true): boolean | null { + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (this._data[i]) { + return true; + } + } + return false; + } + + /** + * Return `true` if all non-NA elements are `true`. + * Returns `null` if all elements are NA and `skipna` is `false`. + */ + all(skipna = true): boolean | null { + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (!this._data[i]) { + return false; + } + } + return true; + } + + /** Count of `true` (non-NA) elements. */ + sum(skipna = true): number | null { + let count = 0; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + if (this._data[i]) { + count++; + } + } + return count; + } + + // ─── Logical operations ─────────────────────────────────────────────────── + + /** + * Element-wise logical AND. 
+ * + * Follows Kleene three-valued logic: + * - `false AND NA` β†’ `false` + * - `true AND NA` β†’ `NA` + */ + and(other: BooleanArray): BooleanArray { + if (other.size !== this.size) { + throw new RangeError(`BooleanArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + const am = this._mask[i] === true; + const bm = other._mask[i] === true; + const av = this._data[i] === true; + const bv = other._data[i] === true; + if (!(am || bm)) { + // Both known + data.push(av && bv); + mask.push(false); + } else if (!(am || av)) { + // a is false β†’ false AND anything = false + data.push(false); + mask.push(false); + } else if (bm || bv) { + // Result is NA + data.push(false); + mask.push(true); + } else { + // b is false β†’ anything AND false = false + data.push(false); + mask.push(false); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Element-wise logical OR. 
+ * + * Follows Kleene three-valued logic: + * - `true OR NA` β†’ `true` + * - `false OR NA` β†’ `NA` + */ + or(other: BooleanArray): BooleanArray { + if (other.size !== this.size) { + throw new RangeError(`BooleanArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + const am = this._mask[i] === true; + const bm = other._mask[i] === true; + const av = this._data[i] === true; + const bv = other._data[i] === true; + if (!(am || bm)) { + // Both known + data.push(av || bv); + mask.push(false); + } else if (!am && av) { + // a is true β†’ true OR anything = true + data.push(true); + mask.push(false); + } else if (!bm && bv) { + // b is true β†’ anything OR true = true + data.push(true); + mask.push(false); + } else { + // Result is NA + data.push(false); + mask.push(true); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Element-wise logical NOT. + * `NOT NA` β†’ `NA`; `NOT true` β†’ `false`; `NOT false` β†’ `true`. + */ + not(): BooleanArray { + const data = this._data.map((v, i) => (this._mask[i] ? false : !v)); + return BooleanArray._fromRaw(data, this._mask.slice()); + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** + * Return a new {@link BooleanArray} with NAs replaced by `value`. + */ + fillna(value: boolean): BooleanArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return BooleanArray._fromRaw(data, mask); + } +} diff --git a/src/core/arrays/datetime_array.ts b/src/core/arrays/datetime_array.ts new file mode 100644 index 00000000..df0d808d --- /dev/null +++ b/src/core/arrays/datetime_array.ts @@ -0,0 +1,276 @@ +/** + * DatetimeArray β€” extension array of nullable {@link Timestamp} values. + * + * Mirrors `pandas.arrays.DatetimeArray`. 
Stores an array of Timestamps (with + * optional timezone) with a separate boolean mask for missing (NA) values. + * + * @example + * ```ts + * import { arrays } from "tsb"; + * import { Timestamp } from "tsb"; + * + * const a = arrays.DatetimeArray.from([ + * new Timestamp("2024-01-01"), + * null, + * new Timestamp("2024-03-15"), + * ]); + * a.dtype; // "datetime64[ns]" + * a.at(1); // null + * a.year; // [2024, null, 2024] + * a.month; // [1, null, 3] + * ``` + * + * @module + */ + +import { Timestamp } from "../timestamp.ts"; +import type { TimestampOptions } from "../timestamp.ts"; + +// ─── DatetimeArray ──────────────────────────────────────────────────────────── + +/** + * A nullable array of {@link Timestamp} values. + * + * Use {@link DatetimeArray.from} to create instances. + */ +export class DatetimeArray { + private readonly _data: Timestamp[]; + private readonly _mask: boolean[]; + private readonly _tz: string | null; + + /** @internal */ + constructor(data: Timestamp[], mask: boolean[], tz: string | null = null) { + if (data.length !== mask.length) { + throw new RangeError( + `DatetimeArray: data length (${data.length}) !== mask length (${mask.length})`, + ); + } + this._data = data; + this._mask = mask; + this._tz = tz; + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link DatetimeArray} from a sequence of Timestamps, strings, or numbers. + * + * @param values - Each element may be a {@link Timestamp}, an ISO string + * (e.g. `"2024-01-01"`), a millisecond-since-epoch number, a JS `Date`, + * `null`, or `undefined`. + * @param options - Options forwarded to the {@link Timestamp} constructor for + * non-Timestamp inputs (e.g. `{ unit: "s", tz: "UTC" }`). 
+ * + * @example + * ```ts + * DatetimeArray.from(["2024-01-01", null, "2024-03-15"]); + * DatetimeArray.from([1704067200000, null], { unit: "ms" }); + * ``` + */ + static from( + values: Iterable, + options?: Readonly, + ): DatetimeArray { + const data: Timestamp[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(new Timestamp(0)); + mask.push(true); + } else if (v instanceof Timestamp) { + data.push(v); + mask.push(false); + } else { + data.push(new Timestamp(v as string | number | Date, options)); + mask.push(false); + } + } + const tz = options?.tz ?? null; + return new DatetimeArray(data, mask, typeof tz === "string" ? tz : null); + } + + /** @internal */ + static _fromRaw(data: Timestamp[], mask: boolean[], tz: string | null = null): DatetimeArray { + return new DatetimeArray(data, mask, tz); + } + + // ─── Core accessors ──────────────────────────────────────────────────────── + + /** Number of elements (including NAs). */ + get size(): number { + return this._data.length; + } + + /** Dtype string β€” mirrors pandas `datetime64[ns]` or `datetime64[ns, tz]`. */ + get dtype(): string { + return this._tz ? `datetime64[ns, ${this._tz}]` : "datetime64[ns]"; + } + + /** IANA timezone, or `null` for timezone-naive arrays. */ + get tz(): string | null { + return this._tz; + } + + /** + * Return the element at index `i`, or `null` if masked. + * Supports negative indexing. + */ + at(i: number): Timestamp | null { + const idx = i < 0 ? this._data.length + i : i; + if (idx < 0 || idx >= this._data.length) { + return null; + } + if (this._mask[idx]) { + return null; + } + return this._data[idx] ?? null; + } + + // ─── NA ──────────────────────────────────────────────────────────────────── + + /** Boolean array where `true` = NA. */ + isna(): boolean[] { + return this._mask.slice(); + } + + /** Boolean array where `true` = not NA. 
*/ + notna(): boolean[] { + return this._mask.map((m) => !m); + } + + // ─── Component accessors ────────────────────────────────────────────────── + + /** Numeric year for each element (NA β†’ null). */ + get year(): (number | null)[] { + return this._extractComponent((ts) => ts.year); + } + + /** Month (1–12) for each element (NA β†’ null). */ + get month(): (number | null)[] { + return this._extractComponent((ts) => ts.month); + } + + /** Day (1–31) for each element (NA β†’ null). */ + get day(): (number | null)[] { + return this._extractComponent((ts) => ts.day); + } + + /** Hour (0–23) for each element (NA β†’ null). */ + get hour(): (number | null)[] { + return this._extractComponent((ts) => ts.hour); + } + + /** Minute (0–59) for each element (NA β†’ null). */ + get minute(): (number | null)[] { + return this._extractComponent((ts) => ts.minute); + } + + /** Second (0–59) for each element (NA β†’ null). */ + get second(): (number | null)[] { + return this._extractComponent((ts) => ts.second); + } + + /** Millisecond (0–999) for each element (NA β†’ null). */ + get millisecond(): (number | null)[] { + return this._extractComponent((ts) => ts.millisecond); + } + + /** Day of week (0=Monday … 6=Sunday) for each element (NA β†’ null). */ + get dayofweek(): (number | null)[] { + return this._extractComponent((ts) => ts.dayofweek); + } + + /** Day of year (1–366) for each element (NA β†’ null). */ + get dayofyear(): (number | null)[] { + return this._extractComponent((ts) => ts.dayofyear); + } + + /** Quarter (1–4) for each element (NA β†’ null). */ + get quarter(): (number | null)[] { + return this._extractComponent((ts) => ts.quarter); + } + + // ─── Conversion ──────────────────────────────────────────────────────────── + + /** Return an array of {@link Timestamp} or `null` for NA positions. */ + toArray(): (Timestamp | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : v)); + } + + /** Milliseconds since epoch for each element (NA β†’ null). 
*/ + asMs(): (number | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : v._utcMs)); + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** Return a new DatetimeArray with NAs replaced by `value`. */ + fillna(value: Timestamp): DatetimeArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return DatetimeArray._fromRaw(data, mask, this._tz); + } + + // ─── Min / Max ───────────────────────────────────────────────────────────── + + /** Earliest (minimum) non-NA Timestamp, or `null` if all are NA. */ + min(): Timestamp | null { + let result: Timestamp | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timestamp; + if (result === null || v._utcMs < result._utcMs) { + result = v; + } + } + return result; + } + + /** Latest (maximum) non-NA Timestamp, or `null` if all are NA. */ + max(): Timestamp | null { + let result: Timestamp | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timestamp; + if (result === null || v._utcMs > result._utcMs) { + result = v; + } + } + return result; + } + + // ─── Iteration ───────────────────────────────────────────────────────────── + + [Symbol.iterator](): Iterator { + let i = 0; + const data = this._data; + const mask = this._mask; + return { + next() { + if (i >= data.length) { + return { value: null, done: true }; + } + const value = mask[i] ? null : (data[i] ?? null); + i++; + return { value, done: false }; + }, + }; + } + + // ─── String representation ───────────────────────────────────────────────── + + toString(): string { + const items = this.toArray().map((v) => (v === null ? 
"" : v.isoformat())); + return `DatetimeArray([${items.join(", ")}], dtype="${this.dtype}")`; + } + + // ─── Private helper ──────────────────────────────────────────────────────── + + private _extractComponent(fn: (ts: Timestamp) => number): (number | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : fn(v))); + } +} diff --git a/src/core/arrays/floating_array.ts b/src/core/arrays/floating_array.ts new file mode 100644 index 00000000..7e20f039 --- /dev/null +++ b/src/core/arrays/floating_array.ts @@ -0,0 +1,284 @@ +/** + * FloatingArray β€” nullable floating-point extension array. + * + * Mirrors `pandas.arrays.FloatingArray`. Stores float values with a separate + * boolean mask for missing (NA) values. Supports `Float32` and `Float64` + * (capital-F nullable variants). + * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * const a = arrays.FloatingArray.from([1.5, null, 3.14], "Float64"); + * a.dtype; // "Float64" + * a.size; // 3 + * a.at(1); // null + * a.sum(); // 4.64 + * a.fillna(0).toArray(); // [1.5, 0, 3.14] + * ``` + * + * @module + */ + +import { MaskedArray } from "./masked_array.ts"; + +// ─── Types ──────────────────────────────────────────────────────────────────── + +/** + * Nullable float dtype names. + */ +export type FloatingDtypeName = "Float32" | "Float64"; + +// ─── FloatingArray ──────────────────────────────────────────────────────────── + +/** + * A nullable floating-point array. + * + * Use {@link FloatingArray.from} to create instances. + */ +export class FloatingArray extends MaskedArray { + private readonly _dtype: FloatingDtypeName; + + /** @internal */ + constructor(data: number[], mask: boolean[], dtype: FloatingDtypeName) { + super(data, mask); + this._dtype = dtype; + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link FloatingArray} from a sequence of values. + * + * @param values - Source values. `null`, `undefined`, and `NaN` become NA. 
+ * @param dtype - Target dtype. Defaults to `"Float64"`. + * + * @example + * ```ts + * FloatingArray.from([1.1, 2.2, null, 4.4]); // Float64 + * FloatingArray.from([1.1, NaN, 3.3], "Float32"); // Float32 + * ``` + */ + static from( + values: Iterable, + dtype: FloatingDtypeName = "Float64", + ): FloatingArray { + if (dtype !== "Float32" && dtype !== "Float64") { + throw new TypeError(`FloatingArray: unknown dtype "${dtype}"`); + } + const data: number[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined || (typeof v === "number" && Number.isNaN(v))) { + data.push(0); + mask.push(true); + } else { + data.push(dtype === "Float32" ? Math.fround(v) : v); + mask.push(false); + } + } + return new FloatingArray(data, mask, dtype); + } + + /** @internal */ + static _fromRaw(data: number[], mask: boolean[], dtype: FloatingDtypeName): FloatingArray { + return new FloatingArray(data, mask, dtype); + } + + // ─── Dtype ──────────────────────────────────────────────────────────────── + + get dtype(): FloatingDtypeName { + return this._dtype; + } + + // ─── Operations ─────────────────────────────────────────────────────────── + + /** Sum of non-NA elements. */ + sum(skipna = true): number | null { + let total = 0; + let hasNonNa = false; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += this._data[i] as number; + hasNonNa = true; + } + return hasNonNa || skipna ? total : null; + } + + /** Mean of non-NA elements. */ + mean(skipna = true): number | null { + let total = 0; + let count = 0; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += this._data[i] as number; + count++; + } + return count > 0 ? total / count : null; + } + + /** Minimum non-NA element. 
*/ + min(skipna = true): number | null { + let result: number | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + const v = this._data[i] as number; + if (result === null || v < result) { + result = v; + } + } + return result; + } + + /** Maximum non-NA element. */ + max(skipna = true): number | null { + let result: number | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + const v = this._data[i] as number; + if (result === null || v > result) { + result = v; + } + } + return result; + } + + /** Number of non-NA elements. */ + count(): number { + return this._mask.filter((m) => !m).length; + } + + /** Standard deviation of non-NA elements (sample, ddof=1). */ + std(skipna = true, ddof = 1): number | null { + const m = this.mean(skipna); + if (m === null) { + return null; + } + let sumSq = 0; + let count = 0; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const d = (this._data[i] as number) - m; + sumSq += d * d; + count++; + } + return count > ddof ? Math.sqrt(sumSq / (count - ddof)) : null; + } + + // ─── Element-wise arithmetic ────────────────────────────────────────────── + + /** Element-wise addition. NA propagates. */ + add(other: FloatingArray | number): FloatingArray { + const [data, mask] = this._binop(other, (a, b) => a + b); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise subtraction. NA propagates. */ + sub(other: FloatingArray | number): FloatingArray { + const [data, mask] = this._binop(other, (a, b) => a - b); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise multiplication. NA propagates. 
*/ + mul(other: FloatingArray | number): FloatingArray { + const [data, mask] = this._binop(other, (a, b) => a * b); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise division. NA propagates. Division by zero β†’ Β±Infinity (masked). */ + truediv(other: FloatingArray | number): FloatingArray { + const [data, mask] = this._binop(other, (a, b) => a / b); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise exponentiation. NA propagates. */ + pow(other: FloatingArray | number): FloatingArray { + const [data, mask] = this._binop(other, (a, b) => a ** b); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + /** @internal */ + private _binop( + other: FloatingArray | number, + fn: (a: number, b: number) => number, + ): [number[], boolean[]] { + if (typeof other === "number") { + const data: number[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + data.push(0); + mask.push(true); + } else { + data.push(fn(this._data[i] as number, other)); + mask.push(false); + } + } + return [data, mask]; + } + if (other.size !== this.size) { + throw new RangeError(`FloatingArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: number[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(0); + mask.push(true); + } else { + data.push(fn(this._data[i] as number, other._data[i] as number)); + mask.push(false); + } + } + return [data, mask]; + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** + * Return a new {@link FloatingArray} with NAs replaced by `value`. + */ + fillna(value: number): FloatingArray { + const data = this._data.map((v, i) => (this._mask[i] ? 
value : v)); + const mask = new Array(data.length).fill(false); + return FloatingArray._fromRaw(data, mask, this._dtype); + } + + // ─── Type conversion ────────────────────────────────────────────────────── + + /** Convert to another floating dtype. */ + astype(dtype: FloatingDtypeName): FloatingArray { + if (dtype !== "Float32" && dtype !== "Float64") { + throw new TypeError(`FloatingArray.astype: unknown dtype "${dtype}"`); + } + const data = this._data.map((v, i) => { + if (this._mask[i]) { + return 0; + } + return dtype === "Float32" ? Math.fround(v) : v; + }); + return FloatingArray._fromRaw(data, this._mask.slice(), dtype); + } +} diff --git a/src/core/arrays/index.ts b/src/core/arrays/index.ts new file mode 100644 index 00000000..9dc5a01f --- /dev/null +++ b/src/core/arrays/index.ts @@ -0,0 +1,55 @@ +/** + * pd.arrays β€” Pandas-compatible typed extension arrays for tsb. + * + * Mirrors the `pandas.arrays` namespace. Provides nullable typed arrays for + * integers, floats, booleans, strings, datetimes, and timedeltas. 
+ * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * // Nullable integer array + * const ints = arrays.IntegerArray.from([1, 2, null, 4], "Int32"); + * ints.toArray(); // [1, 2, null, 4] + * ints.sum(); // 7 + * + * // Nullable float array + * const floats = arrays.FloatingArray.from([1.5, null, 3.0]); + * floats.mean(); // 2.25 + * + * // Nullable boolean array (three-valued logic) + * const bools = arrays.BooleanArray.from([true, false, null]); + * bools.any(); // true + * + * // Nullable string array + * const strs = arrays.StringArray.from(["hello", null, "world"]); + * strs.upper().toArray(); // ["HELLO", null, "WORLD"] + * + * // Datetime array + * const dts = arrays.DatetimeArray.from(["2024-01-01", null]); + * dts.year; // [2024, null] + * + * // Timedelta array + * const tds = arrays.TimedeltaArray.from([86400000, null]); + * tds.days; // [1, null] + * ``` + * + * @module + */ + +export { MaskedArray } from "./masked_array.ts"; +export type { FillValue } from "./masked_array.ts"; + +export { IntegerArray } from "./integer_array.ts"; +export type { IntegerDtypeName } from "./integer_array.ts"; + +export { FloatingArray } from "./floating_array.ts"; +export type { FloatingDtypeName } from "./floating_array.ts"; + +export { BooleanArray } from "./boolean_array.ts"; + +export { StringArray } from "./string_array.ts"; + +export { DatetimeArray } from "./datetime_array.ts"; + +export { TimedeltaArray } from "./timedelta_array.ts"; diff --git a/src/core/arrays/integer_array.ts b/src/core/arrays/integer_array.ts new file mode 100644 index 00000000..240b6293 --- /dev/null +++ b/src/core/arrays/integer_array.ts @@ -0,0 +1,330 @@ +/** + * IntegerArray β€” nullable integer extension array. + * + * Mirrors `pandas.arrays.IntegerArray`. Stores integer values with a separate + * boolean mask to represent missing (NA) values. 
Supports all integer dtypes + * that pandas uses: `Int8`, `Int16`, `Int32`, `Int64`, `UInt8`, `UInt16`, + * `UInt32`, `UInt64` (note capital letter β€” these are the *nullable* variants + * distinct from NumPy `int8` etc.). + * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * const a = arrays.IntegerArray.from([1, null, 3, null, 5], "Int32"); + * a.dtype; // "Int32" + * a.size; // 5 + * a.at(1); // null + * a.toArray(); // [1, null, 3, null, 5] + * a.sum(); // 9 + * a.fillna(0).toArray(); // [1, 0, 3, 0, 5] + * ``` + * + * @module + */ + +import { MaskedArray } from "./masked_array.ts"; + +// ─── Types ──────────────────────────────────────────────────────────────────── + +/** + * Nullable integer dtype names (capital letter prefix = nullable in pandas). + */ +export type IntegerDtypeName = + | "Int8" + | "Int16" + | "Int32" + | "Int64" + | "UInt8" + | "UInt16" + | "UInt32" + | "UInt64"; + +const INTEGER_DTYPES = new Set([ + "Int8", + "Int16", + "Int32", + "Int64", + "UInt8", + "UInt16", + "UInt32", + "UInt64", +]); + +/** @internal */ +function isIntegerDtypeName(s: string): s is IntegerDtypeName { + return INTEGER_DTYPES.has(s as IntegerDtypeName); +} + +// ─── Bounds checking ───────────────────────────────────────────────────────── + +const BOUNDS: Record = { + Int8: [-128, 127], + Int16: [-32768, 32767], + Int32: [-2147483648, 2147483647], + Int64: [Number.MIN_SAFE_INTEGER, Number.MAX_SAFE_INTEGER], + UInt8: [0, 255], + UInt16: [0, 65535], + UInt32: [0, 4294967295], + UInt64: [0, Number.MAX_SAFE_INTEGER], +}; + +/** @internal */ +function checkBounds(value: number, dtype: IntegerDtypeName): void { + const [lo, hi] = BOUNDS[dtype]; + if (value < lo || value > hi) { + throw new RangeError(`IntegerArray(${dtype}): value ${value} out of bounds [${lo}, ${hi}]`); + } +} + +// ─── IntegerArray ───────────────────────────────────────────────────────────── + +/** + * A nullable integer array. + * + * Use {@link IntegerArray.from} to create instances. 
+ */ +export class IntegerArray extends MaskedArray { + private readonly _dtype: IntegerDtypeName; + + /** @internal */ + constructor(data: number[], mask: boolean[], dtype: IntegerDtypeName) { + super(data, mask); + this._dtype = dtype; + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create an {@link IntegerArray} from a sequence of values (or `null`/`undefined` + * for missing values) and an optional dtype. + * + * @param values - Source values. `null` and `undefined` become NA. + * @param dtype - Target dtype. Defaults to `"Int64"`. + * + * @example + * ```ts + * IntegerArray.from([1, 2, null, 4]); // Int64 + * IntegerArray.from([1, 2, null], "Int32"); // Int32 + * ``` + */ + static from( + values: Iterable, + dtype: IntegerDtypeName = "Int64", + ): IntegerArray { + if (!isIntegerDtypeName(dtype)) { + throw new TypeError(`IntegerArray: unknown dtype "${dtype}"`); + } + const data: number[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(0); + mask.push(true); + } else { + const int = Math.trunc(v); + checkBounds(int, dtype); + data.push(int); + mask.push(false); + } + } + return new IntegerArray(data, mask, dtype); + } + + /** + * Create an {@link IntegerArray} from a raw buffer (no copying, no validation). + * + * @internal + */ + static _fromRaw(data: number[], mask: boolean[], dtype: IntegerDtypeName): IntegerArray { + return new IntegerArray(data, mask, dtype); + } + + // ─── Dtype ──────────────────────────────────────────────────────────────── + + get dtype(): IntegerDtypeName { + return this._dtype; + } + + // ─── Operations ─────────────────────────────────────────────────────────── + + /** + * Sum of non-NA elements. Returns `null` if all elements are NA and + * `skipna` is `false`. 
+ */ + sum(skipna = true): number | null { + let total = 0; + let hasNonNa = false; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += this._data[i] as number; + hasNonNa = true; + } + return hasNonNa || skipna ? total : null; + } + + /** Mean of non-NA elements. */ + mean(skipna = true): number | null { + let total = 0; + let count = 0; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += this._data[i] as number; + count++; + } + return count > 0 ? total / count : null; + } + + /** Minimum non-NA element. */ + min(skipna = true): number | null { + let result: number | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + const v = this._data[i] as number; + if (result === null || v < result) { + result = v; + } + } + return result; + } + + /** Maximum non-NA element. */ + max(skipna = true): number | null { + let result: number | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + const v = this._data[i] as number; + if (result === null || v > result) { + result = v; + } + } + return result; + } + + /** Number of non-NA elements. */ + count(): number { + return this._mask.filter((m) => !m).length; + } + + // ─── Element-wise arithmetic ────────────────────────────────────────────── + + /** Element-wise addition. NA propagates. */ + add(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => a + b); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise subtraction. NA propagates. 
*/ + sub(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => a - b); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise multiplication. NA propagates. */ + mul(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => a * b); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise integer division. NA propagates. */ + floordiv(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => Math.trunc(a / b)); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise modulo. NA propagates. */ + mod(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => a % b); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** Element-wise exponentiation. NA propagates. */ + pow(other: IntegerArray | number): IntegerArray { + const [data, mask] = this._binop(other, (a, b) => Math.trunc(a ** b)); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + /** @internal */ + private _binop( + other: IntegerArray | number, + fn: (a: number, b: number) => number, + ): [number[], boolean[]] { + if (typeof other === "number") { + const data: number[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + data.push(0); + mask.push(true); + } else { + data.push(fn(this._data[i] as number, other)); + mask.push(false); + } + } + return [data, mask]; + } + if (other.size !== this.size) { + throw new RangeError(`IntegerArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: number[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(0); + mask.push(true); + } else { + data.push(fn(this._data[i] as number, other._data[i] as number)); + mask.push(false); + } + } + return 
[data, mask]; + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** + * Return a new {@link IntegerArray} with NAs replaced by `value`. + */ + fillna(value: number): IntegerArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return IntegerArray._fromRaw(data, mask, this._dtype); + } + + // ─── Type conversion ────────────────────────────────────────────────────── + + /** Convert to another integer dtype. */ + astype(dtype: IntegerDtypeName): IntegerArray { + if (!isIntegerDtypeName(dtype)) { + throw new TypeError(`IntegerArray.astype: unknown dtype "${dtype}"`); + } + const data = this._data.map((v, i) => { + if (this._mask[i]) { + return 0; + } + checkBounds(v, dtype); + return v; + }); + return IntegerArray._fromRaw(data, this._mask.slice(), dtype); + } +} diff --git a/src/core/arrays/masked_array.ts b/src/core/arrays/masked_array.ts new file mode 100644 index 00000000..238082a4 --- /dev/null +++ b/src/core/arrays/masked_array.ts @@ -0,0 +1,194 @@ +/** + * MaskedArray β€” base class for nullable extension arrays. + * + * Mirrors `pandas.core.arrays.masked.BaseMaskedArray`. Stores values and a + * separate boolean mask where `true` means the element is NA (missing). + * + * All concrete nullable array types ({@link IntegerArray}, {@link FloatingArray}, + * {@link BooleanArray}) extend this class. + * + * @module + */ + +import type { Scalar } from "../../types.ts"; + +// ─── Types ──────────────────────────────────────────────────────────────────── + +/** + * Values accepted as fill value for {@link MaskedArray.fillna}. + */ +export type FillValue = T | null | undefined; + +// ─── MaskedArray ───────────────────────────────────────────────────────────── + +/** + * Abstract base class for masked (nullable) arrays. + * + * @typeParam T - The underlying element type (number, boolean, string, etc.) 
+ * + * @example + * ```ts + * // Constructed via subclasses, e.g. IntegerArray.from([1, null, 3]) + * ``` + */ +export abstract class MaskedArray { + /** + * Stored element values. When `_mask[i]` is `true` this value is + * undefined/unused, but we always maintain the same length for both arrays. + */ + protected readonly _data: T[]; + /** + * Boolean mask where `true` indicates a missing value (NA). + */ + protected readonly _mask: boolean[]; + + /** @internal */ + constructor(data: T[], mask: boolean[]) { + if (data.length !== mask.length) { + throw new RangeError( + `MaskedArray: data length (${data.length}) !== mask length (${mask.length})`, + ); + } + this._data = data; + this._mask = mask; + } + + // ─── Core accessors ──────────────────────────────────────────────────────── + + /** Number of elements (including NAs). */ + get size(): number { + return this._data.length; + } + + /** The dtype name for this array (defined by subclasses). */ + abstract get dtype(): string; + + /** + * Return the element at index `i`, or `null` if it is masked. + * Supports negative indexing. + */ + at(i: number): T | null { + const idx = i < 0 ? this._data.length + i : i; + if (idx < 0 || idx >= this._data.length) { + return null; + } + if (this._mask[idx]) { + return null; + } + return this._data[idx] ?? null; + } + + // ─── NA / notna ──────────────────────────────────────────────────────────── + + /** + * Return a boolean array where `true` indicates a missing element. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).isna(); // [false, true, false] + * ``` + */ + isna(): boolean[] { + return this._mask.slice(); + } + + /** + * Return a boolean array where `true` indicates a non-missing element. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).notna(); // [true, false, true] + * ``` + */ + notna(): boolean[] { + return this._mask.map((m) => !m); + } + + /** `true` if any element is NA. 
*/ + hasNa(): boolean { + return this._mask.some(Boolean); + } + + // ─── Conversion ──────────────────────────────────────────────────────────── + + /** + * Return a plain JS array where masked elements are represented as `null`. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).toArray(); // [1, null, 3] + * ``` + */ + toArray(): (T | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : v)); + } + + /** + * Return a plain JS array, replacing each NA with `naValue`. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).toArray(0); // [1, 0, 3] + * ``` + */ + toArrayFilled(naValue: T): T[] { + return this._data.map((v, i) => (this._mask[i] ? naValue : v)); + } + + // ─── fillna ──────────────────────────────────────────────────────────────── + + /** + * Return a new array with NAs replaced by `value`. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).fillna(0).toArray(); // [1, 0, 3] + * ``` + */ + abstract fillna(value: T): MaskedArray; + + // ─── dropna ──────────────────────────────────────────────────────────────── + + /** + * Return the non-NA values as a plain JS array. + * + * @example + * ```ts + * IntegerArray.from([1, null, 3]).dropna(); // [1, 3] + * ``` + */ + dropna(): T[] { + const out: T[] = []; + for (let i = 0; i < this._data.length; i++) { + if (!this._mask[i]) { + out.push(this._data[i] as T); + } + } + return out; + } + + // ─── Iteration ───────────────────────────────────────────────────────────── + + [Symbol.iterator](): Iterator { + let i = 0; + const data = this._data; + const mask = this._mask; + return { + next() { + if (i >= data.length) { + return { value: null, done: true }; + } + const value = mask[i] ? null : (data[i] ?? null); + i++; + return { value, done: false }; + }, + }; + } + + // ─── String representation ───────────────────────────────────────────────── + + toString(): string { + const items = this.toArray().map((v) => (v === null ? 
"" : String(v))); + return `${this.dtype}([${items.join(", ")}])`; + } +} diff --git a/src/core/arrays/string_array.ts b/src/core/arrays/string_array.ts new file mode 100644 index 00000000..b354bc34 --- /dev/null +++ b/src/core/arrays/string_array.ts @@ -0,0 +1,243 @@ +/** + * StringArray β€” nullable string extension array. + * + * Mirrors `pandas.arrays.StringArray`. Stores string values with a separate + * mask for missing (NA) values. + * + * @example + * ```ts + * import { arrays } from "tsb"; + * + * const a = arrays.StringArray.from(["hello", null, "world"]); + * a.dtype; // "string" + * a.at(1); // null + * a.upper().toArray(); // ["HELLO", null, "WORLD"] + * a.fillna("").toArray(); // ["hello", "", "world"] + * ``` + * + * @module + */ + +import { BooleanArray } from "./boolean_array.ts"; +import { IntegerArray } from "./integer_array.ts"; +import { MaskedArray } from "./masked_array.ts"; + +// ─── StringArray ────────────────────────────────────────────────────────────── + +/** + * A nullable string array. + * + * Use {@link StringArray.from} to create instances. + */ +export class StringArray extends MaskedArray { + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link StringArray} from a sequence of string values (or null/undefined). 
+ * + * @example + * ```ts + * StringArray.from(["a", "b", null, "d"]); + * ``` + */ + static from(values: Iterable): StringArray { + const data: string[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(""); + mask.push(true); + } else { + data.push(String(v)); + mask.push(false); + } + } + return new StringArray(data, mask); + } + + /** @internal */ + static _fromRaw(data: string[], mask: boolean[]): StringArray { + return new StringArray(data, mask); + } + + // ─── Dtype ──────────────────────────────────────────────────────────────── + + get dtype(): "string" { + return "string"; + } + + // ─── String operations ──────────────────────────────────────────────────── + + /** Return a new StringArray with all strings uppercased. NA is preserved. */ + upper(): StringArray { + return this._mapStr((s) => s.toUpperCase()); + } + + /** Return a new StringArray with all strings lowercased. NA is preserved. */ + lower(): StringArray { + return this._mapStr((s) => s.toLowerCase()); + } + + /** Return a new StringArray with leading/trailing whitespace stripped. */ + strip(): StringArray { + return this._mapStr((s) => s.trim()); + } + + /** Return a new StringArray with leading whitespace stripped. */ + lstrip(): StringArray { + return this._mapStr((s) => s.trimStart()); + } + + /** Return a new StringArray with trailing whitespace stripped. */ + rstrip(): StringArray { + return this._mapStr((s) => s.trimEnd()); + } + + /** + * Return a {@link BooleanArray} where `true` if the element contains `pattern`. + * NA elements remain NA in the result. 
+ * + * @example + * ```ts + * StringArray.from(["abc", null, "xyz"]).contains("a"); + * // BooleanArray [true, null, false] + * ``` + */ + contains(pattern: string | RegExp): BooleanArray { + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + data.push(false); + mask.push(true); + } else { + const s = this._data[i] as string; + data.push(typeof pattern === "string" ? s.includes(pattern) : pattern.test(s)); + mask.push(false); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Return a BooleanArray where `true` if the element starts with `prefix`. + */ + startswith(prefix: string): BooleanArray { + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + data.push(false); + mask.push(true); + } else { + data.push((this._data[i] as string).startsWith(prefix)); + mask.push(false); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Return a BooleanArray where `true` if the element ends with `suffix`. + */ + endswith(suffix: string): BooleanArray { + const data: boolean[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + data.push(false); + mask.push(true); + } else { + data.push((this._data[i] as string).endsWith(suffix)); + mask.push(false); + } + } + return BooleanArray._fromRaw(data, mask); + } + + /** + * Return a new StringArray with occurrences of `pat` replaced by `repl`. + */ + replace(pat: string | RegExp, repl: string): StringArray { + return this._mapStr((s) => s.replace(pat, repl)); + } + + /** Return a StringArray with strings zero-padded on the left to `width`. */ + zfill(width: number): StringArray { + return this._mapStr((s) => s.padStart(width, "0")); + } + + /** + * String length for each element as an {@link IntegerArray} (NA β†’ NA). 
+ * + * @example + * ```ts + * StringArray.from(["hi", null, "world"]).len().toArray(); // [2, null, 5] + * ``` + */ + len(): IntegerArray { + const data: number[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + data.push(this._mask[i] ? 0 : (this._data[i] as string).length); + mask.push(this._mask[i] === true); + } + return IntegerArray._fromRaw(data, mask, "Int64"); + } + + /** + * Concatenate strings element-wise with a separator. + * + * @example + * ```ts + * StringArray.from(["a", "b"]).cat(" ", StringArray.from(["x", "y"])); + * // StringArray ["a x", "b y"] + * ``` + */ + cat(sep: string, other: StringArray): StringArray { + if (other.size !== this.size) { + throw new RangeError(`StringArray.cat: size mismatch (${this.size} vs ${other.size})`); + } + const data: string[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(""); + mask.push(true); + } else { + data.push((this._data[i] as string) + sep + (other._data[i] as string)); + mask.push(false); + } + } + return StringArray._fromRaw(data, mask); + } + + /** + * Return a new StringArray with NA elements replaced. + * + * @example + * ```ts + * StringArray.from(["a", null, "c"]).fillna("x").toArray(); + * // ["a", "x", "c"] + * ``` + */ + fillna(value: string): StringArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return StringArray._fromRaw(data, mask); + } + + // ─── Reductions ─────────────────────────────────────────────────────────── + + /** Count of non-NA elements. */ + count(): number { + return this._mask.filter((m) => !m).length; + } + + // ─── Internal helper ────────────────────────────────────────────────────── + + private _mapStr(fn: (s: string) => string): StringArray { + const data = this._data.map((v, i) => (this._mask[i] ? 
"" : fn(v as string))); + return StringArray._fromRaw(data, this._mask.slice()); + } +} diff --git a/src/core/arrays/timedelta_array.ts b/src/core/arrays/timedelta_array.ts new file mode 100644 index 00000000..60851c75 --- /dev/null +++ b/src/core/arrays/timedelta_array.ts @@ -0,0 +1,334 @@ +/** + * TimedeltaArray β€” extension array of nullable {@link Timedelta} values. + * + * Mirrors `pandas.arrays.TimedeltaArray`. Stores an array of Timedelta values + * with a separate boolean mask for missing (NA) values. + * + * @example + * ```ts + * import { arrays } from "tsb"; + * import { Timedelta } from "tsb"; + * + * const a = arrays.TimedeltaArray.from([ + * Timedelta.fromComponents({ days: 1 }), + * null, + * Timedelta.fromComponents({ hours: 6 }), + * ]); + * a.dtype; // "timedelta64[ns]" + * a.at(1); // null + * a.days; // [1, null, 0] + * a.totalSeconds; // [86400, null, 21600] + * ``` + * + * @module + */ + +import { Timedelta } from "../timedelta.ts"; + +// ─── TimedeltaArray ─────────────────────────────────────────────────────────── + +/** + * A nullable array of {@link Timedelta} values. + * + * Use {@link TimedeltaArray.from} to create instances. + */ +export class TimedeltaArray { + private readonly _data: Timedelta[]; + private readonly _mask: boolean[]; + + /** @internal */ + constructor(data: Timedelta[], mask: boolean[]) { + if (data.length !== mask.length) { + throw new RangeError( + `TimedeltaArray: data length (${data.length}) !== mask length (${mask.length})`, + ); + } + this._data = data; + this._mask = mask; + } + + // ─── Factory ─────────────────────────────────────────────────────────────── + + /** + * Create a {@link TimedeltaArray} from a sequence of Timedelta values, + * numbers (milliseconds), ISO strings, or null/undefined. + * + * @param values - Source values. Numbers are interpreted as milliseconds. + * ISO duration strings like `"1 days 02:00:00"` or `"P1DT2H"` are parsed. 
+ * + * @example + * ```ts + * TimedeltaArray.from([ + * Timedelta.fromComponents({ days: 1 }), + * null, + * 86400000, // 1 day in ms + * "1 days 00:00:00", + * ]); + * ``` + */ + static from(values: Iterable): TimedeltaArray { + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (const v of values) { + if (v === null || v === undefined) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else if (v instanceof Timedelta) { + data.push(v); + mask.push(false); + } else if (typeof v === "number") { + data.push(Timedelta.fromMilliseconds(v)); + mask.push(false); + } else { + data.push(Timedelta.parse(v)); + mask.push(false); + } + } + return new TimedeltaArray(data, mask); + } + + /** @internal */ + static _fromRaw(data: Timedelta[], mask: boolean[]): TimedeltaArray { + return new TimedeltaArray(data, mask); + } + + // ─── Core accessors ──────────────────────────────────────────────────────── + + /** Number of elements (including NAs). */ + get size(): number { + return this._data.length; + } + + /** Dtype string β€” `"timedelta64[ns]"`. */ + get dtype(): "timedelta64[ns]" { + return "timedelta64[ns]"; + } + + /** + * Return the element at index `i`, or `null` if masked. + * Supports negative indexing. + */ + at(i: number): Timedelta | null { + const idx = i < 0 ? this._data.length + i : i; + if (idx < 0 || idx >= this._data.length) { + return null; + } + if (this._mask[idx]) { + return null; + } + return this._data[idx] ?? null; + } + + // ─── NA ──────────────────────────────────────────────────────────────────── + + /** Boolean array where `true` = NA. */ + isna(): boolean[] { + return this._mask.slice(); + } + + /** Boolean array where `true` = not NA. */ + notna(): boolean[] { + return this._mask.map((m) => !m); + } + + // ─── Component accessors ────────────────────────────────────────────────── + + /** Integer days component for each element (NA β†’ null). 
*/ + get days(): (number | null)[] { + return this._extractComponent((td) => td.days); + } + + /** Integer hours component for each element (NA β†’ null). */ + get hours(): (number | null)[] { + return this._extractComponent((td) => td.hours); + } + + /** Integer minutes component for each element (NA β†’ null). */ + get minutes(): (number | null)[] { + return this._extractComponent((td) => td.minutes); + } + + /** Integer seconds component for each element (NA β†’ null). */ + get seconds(): (number | null)[] { + return this._extractComponent((td) => td.seconds); + } + + /** Integer milliseconds component for each element (NA β†’ null). */ + get milliseconds(): (number | null)[] { + return this._extractComponent((td) => td.milliseconds); + } + + /** Total number of milliseconds for each element (NA β†’ null). */ + get totalMilliseconds(): (number | null)[] { + return this._extractComponent((td) => td.totalMilliseconds); + } + + /** Total number of seconds (float) for each element (NA β†’ null). */ + get totalSeconds(): (number | null)[] { + return this._extractComponent((td) => td.totalSeconds); + } + + /** Total number of hours (float) for each element (NA β†’ null). */ + get totalHours(): (number | null)[] { + return this._extractComponent((td) => td.totalHours); + } + + /** Total number of days (float) for each element (NA β†’ null). */ + get totalDays(): (number | null)[] { + return this._extractComponent((td) => td.totalDays); + } + + // ─── Arithmetic ─────────────────────────────────────────────────────────── + + /** + * Add a scalar {@link Timedelta} to every element. NA propagates. + */ + add(other: TimedeltaArray | Timedelta): TimedeltaArray { + if (other instanceof Timedelta) { + const data = this._data.map((v, i) => (this._mask[i] ? 
v : v.add(other))); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + if (other.size !== this.size) { + throw new RangeError(`TimedeltaArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else { + data.push((this._data[i] as Timedelta).add(other._data[i] as Timedelta)); + mask.push(false); + } + } + return TimedeltaArray._fromRaw(data, mask); + } + + /** + * Subtract a scalar {@link Timedelta} from every element. NA propagates. + */ + sub(other: TimedeltaArray | Timedelta): TimedeltaArray { + if (other instanceof Timedelta) { + const data = this._data.map((v, i) => (this._mask[i] ? v : v.sub(other))); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + if (other.size !== this.size) { + throw new RangeError(`TimedeltaArray: operand size mismatch (${this.size} vs ${other.size})`); + } + const data: Timedelta[] = []; + const mask: boolean[] = []; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i] || other._mask[i]) { + data.push(Timedelta.fromMilliseconds(0)); + mask.push(true); + } else { + data.push((this._data[i] as Timedelta).sub(other._data[i] as Timedelta)); + mask.push(false); + } + } + return TimedeltaArray._fromRaw(data, mask); + } + + /** Multiply every element by a scalar. NA propagates. */ + mul(factor: number): TimedeltaArray { + const data = this._data.map((v, i) => (this._mask[i] ? v : v.mul(factor))); + return TimedeltaArray._fromRaw(data, this._mask.slice()); + } + + // ─── Conversion ──────────────────────────────────────────────────────────── + + /** Return an array of {@link Timedelta} or `null` for NA positions. */ + toArray(): (Timedelta | null)[] { + return this._data.map((v, i) => (this._mask[i] ? 
null : v)); + } + + // ─── Reductions ─────────────────────────────────────────────────────────── + + /** Sum of non-NA elements (millisecond precision). */ + sum(skipna = true): Timedelta | null { + let total = 0; + let hasNonNa = false; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + if (!skipna) { + return null; + } + continue; + } + total += (this._data[i] as Timedelta).totalMilliseconds; + hasNonNa = true; + } + return hasNonNa || skipna ? Timedelta.fromMilliseconds(total) : null; + } + + /** Minimum non-NA element. */ + min(): Timedelta | null { + let result: Timedelta | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timedelta; + if (result === null || v.totalMilliseconds < result.totalMilliseconds) { + result = v; + } + } + return result; + } + + /** Maximum non-NA element. */ + max(): Timedelta | null { + let result: Timedelta | null = null; + for (let i = 0; i < this._data.length; i++) { + if (this._mask[i]) { + continue; + } + const v = this._data[i] as Timedelta; + if (result === null || v.totalMilliseconds > result.totalMilliseconds) { + result = v; + } + } + return result; + } + + // ─── fillna ─────────────────────────────────────────────────────────────── + + /** Return a new TimedeltaArray with NAs replaced by `value`. */ + fillna(value: Timedelta): TimedeltaArray { + const data = this._data.map((v, i) => (this._mask[i] ? value : v)); + const mask = new Array(data.length).fill(false); + return TimedeltaArray._fromRaw(data, mask); + } + + // ─── Iteration ───────────────────────────────────────────────────────────── + + [Symbol.iterator](): Iterator { + let i = 0; + const data = this._data; + const mask = this._mask; + return { + next() { + if (i >= data.length) { + return { value: null, done: true }; + } + const value = mask[i] ? null : (data[i] ?? 
null); + i++; + return { value, done: false }; + }, + }; + } + + // ─── String representation ───────────────────────────────────────────────── + + toString(): string { + const items = this.toArray().map((v) => (v === null ? "" : v.toString())); + return `TimedeltaArray([${items.join(", ")}], dtype="${this.dtype}")`; + } + + // ─── Private helper ──────────────────────────────────────────────────────── + + private _extractComponent(fn: (td: Timedelta) => number): (number | null)[] { + return this._data.map((v, i) => (this._mask[i] ? null : fn(v))); + } +} diff --git a/src/core/flags.ts b/src/core/flags.ts new file mode 100644 index 00000000..546cb031 --- /dev/null +++ b/src/core/flags.ts @@ -0,0 +1,186 @@ +/** + * Flags β€” metadata flags for DataFrame and Series objects. + * + * Mirrors `pandas.core.flags.Flags`. Provides the `allowsDuplicateLabels` + * flag that controls whether duplicate row/column labels are permitted in the + * associated DataFrame or Series. + * + * @example + * ```ts + * import { DataFrame, DuplicateLabelError } from "tsb"; + * + * const df = DataFrame.fromColumns({ a: [1, 2, 3] }); + * df.flags.allowsDuplicateLabels; // true (default) + * + * df.flags.allowsDuplicateLabels = false; + * // Setting false on a DataFrame with no duplicates is fine. + * + * const dfDup = new DataFrame( + * new Map([["a", df.col("a")]]), + * df.index.append(df.index), // duplicate index + * ); + * dfDup.flags.allowsDuplicateLabels = false; // throws DuplicateLabelError + * ``` + * + * @packageDocumentation + */ + +import { DuplicateLabelError } from "../errors.ts"; + +// --------------------------------------------------------------------------- +// Structural interfaces (no imports from frame.ts / series.ts) +// --------------------------------------------------------------------------- + +/** + * Minimal structural interface satisfied by any `Index` instance. 
+ * Defined here (instead of importing from base-index.ts) to avoid circular + * imports β€” frame.ts β†’ flags.ts must not require flags.ts β†’ frame.ts. + */ +interface IndexLike { + readonly values: readonly unknown[]; + readonly size: number; +} + +/** + * Structural interface satisfied by both `DataFrame` and `Series`. + * Used as the WeakMap key so flags.ts never imports the concrete classes. + */ +export interface FlaggedObject { + /** Row index of the object. */ + readonly index: IndexLike; +} + +// --------------------------------------------------------------------------- +// Internal state registry +// --------------------------------------------------------------------------- + +interface FlagsState { + allowsDuplicateLabels: boolean; +} + +const registry = new WeakMap(); + +function getState(obj: FlaggedObject): FlagsState { + let state = registry.get(obj); + if (state === undefined) { + state = { allowsDuplicateLabels: true }; + registry.set(obj, state); + } + return state; +} + +// --------------------------------------------------------------------------- +// Flags class +// --------------------------------------------------------------------------- + +/** + * Metadata flags for a `DataFrame` or `Series`. + * + * Accessible via `df.flags` or `series.flags`. Mutations are reflected + * immediately on the underlying object because state is stored in a + * module-level WeakMap keyed by the object reference. + * + * ### pandas reference + * `pandas.core.flags.Flags` + */ +export class Flags { + private readonly _obj: FlaggedObject; + + /** + * @param obj - The DataFrame or Series this Flags object is bound to. + * @param opts.allowsDuplicateLabels - Initial value for `allowsDuplicateLabels`. + * Defaults to `true` when not previously set. 
+ */ + constructor(obj: FlaggedObject, opts: { allowsDuplicateLabels?: boolean } = {}) { + this._obj = obj; + if (opts.allowsDuplicateLabels !== undefined) { + getState(obj).allowsDuplicateLabels = opts.allowsDuplicateLabels; + } + } + + // ── allowsDuplicateLabels ───────────────────────────────────────────────── + + /** + * Whether duplicate labels (along any axis) are allowed. + * + * Defaults to `true`. When set to `false`, any existing duplicate labels + * trigger a `DuplicateLabelError` immediately. Future operations that would + * produce duplicate labels also raise. + * + * @example + * ```ts + * df.flags.allowsDuplicateLabels; // true + * df.flags.allowsDuplicateLabels = false; + * df.flags.allowsDuplicateLabels; // false + * ``` + */ + get allowsDuplicateLabels(): boolean { + return getState(this._obj).allowsDuplicateLabels; + } + + set allowsDuplicateLabels(value: boolean) { + getState(this._obj).allowsDuplicateLabels = value; + if (!value) { + this._validateNoDuplicates(); + } + } + + // ── helpers ─────────────────────────────────────────────────────────────── + + /** + * Raise `DuplicateLabelError` if the bound object currently has duplicate + * row-index labels. + */ + private _validateNoDuplicates(): void { + const { values } = this._obj.index; + const seen = new Set(); + for (const label of values) { + if (seen.has(label)) { + throw new DuplicateLabelError(`Index has duplicate keys: [${String(label)}]`); + } + seen.add(label); + } + } + + /** + * Raise `DuplicateLabelError` if `allowsDuplicateLabels` is `false` and + * the bound object has duplicate labels. Called by DataFrame/Series methods + * after operations that could introduce duplicates. + */ + raiseOnDuplicates(): void { + if (!this.allowsDuplicateLabels) { + this._validateNoDuplicates(); + } + } + + /** + * Return a copy of this Flags object bound to the **same** underlying object. 
+ * + * The returned `Flags` shares state with the original β€” mutations to either + * are reflected in both (they both write to the same WeakMap entry). + */ + copy(): Flags { + return new Flags(this._obj); + } + + /** Human-readable representation mirroring pandas' `repr(df.flags)`. */ + toString(): string { + return ``; + } +} + +// --------------------------------------------------------------------------- +// Registry accessor (used by DataFrame.flags / Series.flags getters) +// --------------------------------------------------------------------------- + +/** + * Return (or lazily create) the `Flags` wrapper for the given object. + * + * Each call creates a *new* `Flags` wrapper object, but all wrappers for the + * same `obj` share the same state via the module-level WeakMap registry. + * + * @param obj - The DataFrame or Series to get flags for. + */ +export function getFlags(obj: FlaggedObject): Flags { + return new Flags(obj); +} diff --git a/src/core/frame.ts b/src/core/frame.ts index ec18d144..3f39052c 100644 --- a/src/core/frame.ts +++ b/src/core/frame.ts @@ -26,6 +26,8 @@ import type { ExpandingOptions } from "../window/index.ts"; import { Rolling } from "../window/index.ts"; import type { RollingOptions } from "../window/index.ts"; import { Index } from "./base-index.ts"; +import { getFlags } from "./flags.ts"; +import type { Flags } from "./flags.ts"; import { RangeIndex } from "./range-index.ts"; import { Series } from "./series.ts"; @@ -245,6 +247,21 @@ export class DataFrame { return this.index.size === 0 || this.columns.size === 0; } + /** + * Metadata flags for this DataFrame. + * + * Controls behaviour such as whether duplicate labels are allowed. 
+ * + * @example + * ```ts + * df.flags.allowsDuplicateLabels; // true (default) + * df.flags.allowsDuplicateLabels = false; + * ``` + */ + get flags(): Flags { + return getFlags(this); + } + // ─── column access ──────────────────────────────────────────────────────── /** @@ -816,9 +833,7 @@ function isIndexLike(v: unknown): v is Index