from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, col, count

spark = SparkSession.builder.appName("TravelInsuranceAnalysis").getOrCreate()

df = spark.read.csv("hdfs://path/to/insurance_data.csv", header=True, inferSchema=True)


def get_top_selling_products():
    # Aggregate net sales per product and keep the ten best sellers.
    product_sales = df.groupBy("Product Name").agg(sum("Net Sales").alias("total_sales"))
    top_10_products = product_sales.orderBy(col("total_sales").desc()).limit(10)
    return top_10_products


def analyze_high_value_customers():
    # Treat policies at or above the 90th percentile of net sales as high-value.
    # Note: approxQuantile is a DataFrame method, not a pyspark.sql.functions import.
    high_value_threshold = df.approxQuantile("Net Sales", [0.9], 0.0)[0]
    high_value_df = df.filter(col("Net Sales") >= high_value_threshold)
    # Profile high-value customers by age group, gender, and destination.
    customer_features = high_value_df.groupBy("Age Group", "Gender", "Destination").agg(
        count("*").alias("count"),
        sum("Net Sales").alias("total_spending"),
    )
    return customer_features.orderBy(col("total_spending").desc())


def calculate_claim_rates_by_product():
    # Count all policies per product, then count only the policies with a claim.
    total_policies_df = df.groupBy("Product Name").agg(count("*").alias("policy_count"))
    claim_df = df.filter(col("Claim") == "Yes")
    claims_count_df = claim_df.groupBy("Product Name").agg(count("*").alias("claim_count"))
    # Left join keeps products with zero claims; fill their missing claim_count with 0.
    claim_rate_df = total_policies_df.join(claims_count_df, "Product Name", "left_outer") \
        .fillna(0, subset=["claim_count"])
    final_df = claim_rate_df.withColumn("claim_rate", (col("claim_count") / col("policy_count")) * 100)
    return final_df.orderBy(col("claim_rate").desc())
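

# Usage sketch (assumption: this block is not part of the original script and is
# only meant to illustrate how the three analyses above could be invoked; the
# .show() calls simply print the first rows of each result to stdout).
if __name__ == "__main__":
    get_top_selling_products().show(truncate=False)
    analyze_high_value_customers().show(truncate=False)
    calculate_claim_rates_by_product().show(truncate=False)
    spark.stop()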