# Drop every row whose "important_col" is null. The call returns the filtered
# DataFrame rather than modifying `df`, so bind the result if it is needed.
df.dropna(how="any", subset=["important_col"])

# Replace nulls column by column: "age" -> 0, "name" -> "unknown". The
# per-column defaults must be passed as a single dict argument.
df.fillna({"age": 0, "name": "unknown"})
def transform_etl():
    """Run the raw-JSON-to-Parquet ETL pipeline.

    Steps, in order:
      1. Read every JSON file matching ``raw_data/*``.
      2. Keep only rows where ``status = 'active'`` and drop duplicate
         rows that share a ``user_id``.
      3. Join the result with ``lookup_table`` on ``product_id``.
      4. Write the joined data as Parquet under ``warehouse/``,
         partitioned by the ``date`` column.

    Relies on the names ``spark`` and ``lookup_table`` being defined in the
    enclosing scope. Returns ``None``; the only effect is the written output.
    """
    raw = spark.read.json("raw_data/*")

    # Parentheses let the method chain span lines without a backslash.
    cleaned = (
        raw.filter("status = 'active'")
        .dropDuplicates(["user_id"])
    )

    # No join type is given, so the library's default join type applies.
    enriched = cleaned.join(lookup_table, "product_id")

    enriched.write.partitionBy("date").parquet("warehouse/")


# NOTE(review): stray text that was fused onto the end of this line in the
# source, preserved here verbatim: "beginning apache spark 3 pdf"
Use unpersist() to free memory.