Simple segment

customers_who_might_be_interested_in_investments

select
  customer_id
from
  hive_metastore.odap_features.customer
where
  transactions_sum_amount_in_last_30d >= 50000 and
  investice_web_visits_count_in_last_90d > 0 and
	

Segment with join

customers_who_havent_paid_their_loans_for_3_months

select
  customer_id
from
  hive_metastore.odap_features.customer as c
join
  hive_metastore.sdm.loans as l
where
  c.gender == "M" and
  l.last_payment_date < (current_date() - INTERVAL 3 MONTH)

Segment using a model

customers_with_high_ptb_mortgage

import mlflow
from pyspark.sql.functions import struct, col

logged_model = "runs:/7eab177aa30543db8daf6e22e9dede85/ptb_mortgage"

# Load model as a Spark UDF. Override result_type if the model does not return double values.
loaded_model = mlflow.pyfunc.spark_udf(
    spark, model_uri=logged_model, result_type="double"
)

# Predict ptb_mortgage, filter for high values, select customer_ids
df_final = (
    df.withColumn("ptb_mortgage", loaded_model(struct(*map(col, df.columns))))
    .filter(col("ptb_mortgage") > 0.7)
    .select("customer_id")
)