@@ -52,6 +52,8 @@ df = ctx.from_pylist([{"a": 1, "b": "x"}, {"a": 2, "b": "y"}])
5252df = ctx.from_pandas(pandas_df)
5353df = ctx.from_polars(polars_df)
5454df = ctx.from_arrow(arrow_table)
55+ df = ctx.read_batch(record_batch) # one pa.RecordBatch, no named table
56+ df = ctx.read_batches([batch1, batch2]) # several pa.RecordBatch
5557
5658# From SQL
5759df = ctx.sql(" SELECT a, b FROM my_table WHERE a > 1" )
@@ -434,11 +436,19 @@ logical operators.
434436``` python
435437col(" a" ).is_null()
436438col(" a" ).is_not_null()
437- col(" a" ).fill_null(lit(0 )) # replace NULL with a value
439+ col(" a" ).fill_null(lit(0 )) # replace NULL with a value (single expression)
438440F.coalesce(col(" a" ), col(" b" )) # first non-null value
439441F.nullif(col(" a" ), lit(0 )) # return NULL if a == 0
440442```
441443
444+ To fill nulls across the whole DataFrame (optionally limited to a subset of
445+ columns), use the DataFrame-level method:
446+
447+ ``` python
448+ df.fill_null(0 ) # every column
449+ df.fill_null(0 , subset = [" a" , " b" ]) # only these columns
450+ ```
451+
442452### CASE / WHEN
443453
444454``` python
@@ -466,6 +476,15 @@ import pyarrow as pa
466476col(" a" ).cast(pa.float64())
467477col(" a" ).cast(pa.utf8())
468478col(" a" ).cast(pa.date32())
479+
480+ col(" a" ).try_cast(pa.int32()) # like cast(), but yields NULL on failure instead of erroring
481+ ```
482+
483+ To cast several columns in a single call, pass a mapping of column name to
484+ Arrow type to the DataFrame-level ` df.cast(...) ` :
485+
486+ ``` python
487+ df.cast({" a" : pa.float64(), " b" : pa.int32()})
469488```
470489
471490### Aliasing
@@ -477,7 +496,7 @@ col("a").cast(pa.date32())
477496### BETWEEN and IN
478497
479498``` python
480- col(" a" ).between(lit( 1 ), lit( 10 )) # 1 <= a <= 10
499+ col(" a" ).between(1 , 10 ) # 1 <= a <= 10 (bounds auto-wrap)
481500F.in_list(col(" a" ), [lit(1 ), lit(2 ), lit(3 )]) # a IN (1, 2, 3)
482501F.in_list(col(" a" ), [lit(1 ), lit(2 )], negated = True ) # a NOT IN (1, 2)
483502```
@@ -534,7 +553,7 @@ F.array_transform(col("a"), F.lambda_(["v"], F.lambda_var("v") * lit(2)))
534553| ` CASE x WHEN 1 THEN 'a' END ` | ` F.case(col("x")).when(lit(1), lit("a")).end() ` |
535554| ` CASE WHEN x > 1 THEN 'a' END ` | ` F.when(col("x") > 1, lit("a")).end() ` |
536555| ` x IN (1, 2, 3) ` | ` F.in_list(col("x"), [lit(1), lit(2), lit(3)]) ` |
537- | ` x BETWEEN 1 AND 10 ` | ` col("x").between(lit(1), lit(10) ) ` |
556+ | ` x BETWEEN 1 AND 10 ` | ` col("x").between(1, 10 ) ` |
538557| ` CAST(x AS DOUBLE) ` | ` col("x").cast(pa.float64()) ` |
539558| ` ROW_NUMBER() OVER (...) ` | ` F.row_number(partition_by=[...], order_by=[...]) ` |
540559| ` SUM(x) OVER (...) ` | ` F.sum(col("x")).over(window) ` |
@@ -556,8 +575,9 @@ F.array_transform(col("a"), F.lambda_(["v"], F.lambda_var("v") * lit(2)))
556575 - arithmetic between two literals with no column involved:
557576 ` lit(1) - col("discount") ` is fine, but ` lit(1) - lit(2) ` needs both
558577 - values that must carry a specific Arrow type, via ` lit(pa.scalar(...)) `
559- - ` .when(...) ` , ` .otherwise(...) ` , ` F.nullif(...) ` , ` .between(...) ` ,
560- ` F.in_list(...) ` and similar method/function arguments
578+ - ` .when(...) ` , ` .otherwise(...) ` , ` F.nullif(...) ` , ` F.in_list(...) `
579+ and similar method/function arguments (note: ` .between(...) `
580+ auto-wraps its bounds, so ` col("a").between(1, 10) ` needs no ` lit() ` )
561581
5625823 . ** Column name quoting** : Column names are normalized to lowercase by default
563583 in both ` select("...") ` and ` col("...") ` . To reference a column with
@@ -576,7 +596,8 @@ F.array_transform(col("a"), F.lambda_(["v"], F.lambda_var("v") * lit(2)))
576596 partition frame, set ` window_frame=WindowFrame("rows", None, None) ` .
577597
5785986 . ** Arithmetic on aggregates belongs in a later ` select ` , not inside
579- ` aggregate ` ** : Each item in the aggregate list must be a single aggregate
599+ ` aggregate ` ** * (applies to datafusion-python 53 and earlier; fixed in 54)* :
600+ Each item in the aggregate list must be a single aggregate
580601 call (optionally aliased). Combining aggregates with arithmetic inside
581602 ` aggregate(...) ` fails with ` Internal error: Invalid aggregate expression ` .
582603 Alias the aggregates, then compute the combination downstream:
@@ -609,6 +630,12 @@ F.array_transform(col("a"), F.lambda_(["v"], F.lambda_var("v") * lit(2)))
609630 # (note: join_on keeps both key columns in the output, unlike on="key")
610631 li.join_on(failed, col(" l_orderkey" ) == col(" o_orderkey" ))
611632 ```
633+ ` DataFrame.col(name) ` (and ` DataFrame.column(name) ` ) returns a column reference
634+ qualified to that DataFrame. Qualifying each side makes the predicate explicit, which
635+ is how to disambiguate it when the same column name exists on both sides:
636+ ``` python
637+ li.join_on(failed, li.col(" l_orderkey" ) == failed.col(" o_orderkey" ))
638+ ```
612639
613640## Idiomatic Patterns
614641
@@ -746,7 +773,7 @@ F.left(col("c_phone"), lit(2)) # prefix shortcut
746773
747774** Array/List** : ` array ` , ` make_array ` , ` array_agg ` , ` array_length ` ,
748775` array_element ` , ` array_slice ` , ` array_append ` , ` array_prepend ` ,
749- ` array_concat ` , ` array_has ` , ` array_has_all ` , ` array_has_any ` , ` array_position ` ,
776+ ` array_concat ` , ` array_contains ` , ` array_has ` , ` array_has_all ` , ` array_has_any ` , ` array_position ` ,
750777` array_remove ` , ` array_distinct ` , ` array_sort ` , ` array_reverse ` , ` flatten ` ,
751778` array_to_string ` , ` array_intersect ` , ` array_union ` , ` array_except ` ,
752779` generate_series `
@@ -808,7 +835,6 @@ both `functions` and `functions.spark` may behave differently:
808835| ` concat ` | NULL inputs treated as empty | NULL inputs propagate to NULL |
809836| ` round ` | HALF_EVEN (banker's) | HALF_UP |
810837| ` trunc ` | Numeric truncation | Date truncation |
811- | ` substring ` | 1-indexed | 1-indexed (parity) |
812838
813839Pick the namespace whose semantics match your intent — both stay imported
814840side by side; ` enable_spark_functions() ` only affects SQL.
0 commit comments