RegressionTestResult

RegressionTestResult is a dataclass with methods for inspecting the differences between two DataFrames.

pyspark_regression.RegressionTest

A class with methods to compare differences between two Spark DataFrames.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df_old | DataFrame | Old DataFrame | required |
| df_new | DataFrame | New DataFrame | required |
| pk | str or Column | Primary key shared between df_old and df_new. Can be either a string or a Spark Column. | required |
| table_name | str | Name of the table | 'df' |
| num_samples | int | Number of records to pull when retrieving samples | 5 |
| use_cache | bool | If True, ignore checkpointing and cache DataFrames instead | False |
| checkpoint_dir | Optional[str] | If set, Spark checkpoints DataFrames for optimization. Ignored if use_cache=True | None |

Returns: RegressionTest
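
A minimal usage sketch, assuming RegressionTest is importable from the package root as the heading above suggests (the toy data and the local SparkSession setup are illustrative, not part of the API):

```python
from pyspark.sql import SparkSession
from pyspark_regression import RegressionTest

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Two toy tables sharing the primary key column 'id'
df_old = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
df_new = spark.createDataFrame([(1, "a"), (2, "B")], ["id", "val"])

rt = RegressionTest(spark, df_old=df_old, df_new=df_new, pk="id", table_name="my_table")
print(rt.success)   # False: 'val' changed for pk 2
print(rt.summary)   # Markdown report of the differences
```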

Source code in src/pyspark_regression/main.py
```python
class RegressionTest:
    """
    A class with methods to compare differences between two Spark DataFrames.

    Args:
        df_old (DataFrame): Old DataFrame
        df_new (DataFrame): New DataFrame
        pk (str | Column): Primary Key shared between df_old and df_new. Can be either a string or a Spark Column.
        table_name (str = 'df'): Name of table
        num_samples (int = 5): Number of records to pull when retrieving samples.
        use_cache (bool = False): If True, will ignore checkpointing and use caching on dataframes.
        checkpoint_dir (Optional[str] = None): If set, Spark will checkpoint dataframes for optimization. Ignored if use_cache=True.
    Returns:
        RegressionTest
    """

    def __init__(
        self,
        spark: SparkSession,
        df_old: DataFrame,
        df_new: DataFrame,
        pk: str,
        table_name: str = "df",
        num_samples: int = 5,
        use_cache: bool = False,
        checkpoint_dir: Optional[str] = None,
    ):

        self.spark: SparkSession = spark
        "Spark Session"
        self.pk: str = pk
        "Primary Key shared between df_old and df_new. Can be either a string or a Spark Column."
        self.table_name: str = table_name
        "Name of table"
        self.num_samples: int = num_samples
        "Number of records to pull when retrieving samples."
        self.use_cache = use_cache
        "If True, will use caching on dataframes and ignore checkpointing."
        self.checkpoint_dir = checkpoint_dir
        "If set, Spark will checkpoint dataframes for optimization. Ignored if use_cache=True."

        # Ensure 'pk' and 'diff_cols' are not columns in the old or new dataframes.
        for c in ["pk", "diff_cols"]:
            if c in df_old.columns:
                raise KeyError(f"The column name '{c}' itself is reserved. Please remove it from df_old.")
            if c in df_new.columns:
                raise KeyError(f"The column name '{c}' is reserved. Please remove it from df_new.")

        # Ensure the pk column exists in both dataframes
        if pk not in df_old.columns:
            raise KeyError(f"pk '{pk}' is missing from df_old.")
        if pk not in df_new.columns:
            raise KeyError(f"pk '{pk}' is missing from df_new.")

        self.df_old: DataFrame = df_old.withColumn("pk", F.col(self.pk)).orderBy("pk")
        "Old DataFrame"
        self.df_new: DataFrame = df_new.withColumn("pk", F.col(self.pk)).orderBy("pk")
        "New DataFrame"

        # Handle caching
        self.df_duplicate_old = self._df_duplicate_old
        "DataFrame of duplicate pks in old table, with count of duplicates"
        self.df_duplicate_new = self._df_duplicate_new
        "DataFrame of duplicate pks in new table, with count of duplicates"
        self.df_orphan_old = self._df_orphan_old
        "DataFrame of orphan pks in old table"
        self.df_orphan_new = self._df_orphan_new
        "DataFrame of orphan pks in new table"
        self.df_comparable = self._df_comparable
        """
        DataFrame of rows that are eligible for regression testing.

        Rows are eligible if they:

        1. Are not duplicates
        2. Are not orphans
        3. Have matching primary keys between `df_old` and `df_new`
        """
        self.df_regression = self._df_regression
        "Same as df_comparable, but with column_map filtered for actual differences and renamed to diffs."
        self.df_diff = self._df_diff
        "DataFrame that pivots and filters `df_regression` to show one row per column containing values that are different"
        self.df_diff_cols = self._df_diff_cols
        "DataFrame of column names that contain diffs."
        self.df_diff_summary = self._df_diff_summary
        "A summary of df_diff, aggregating a count of records/pks"
        self.df_diff_sample = self._df_diff_sample
        "Same as df_diff, but limited to num_sample rows per column per diff category"

        if self.use_cache:
            self.df_duplicate_old.cache()
            self.df_duplicate_new.cache()
            self.df_orphan_old.cache()
            self.df_orphan_new.cache()
            self.df_regression.cache()
            self.df_diff.cache()
        elif self.checkpoint_dir is not None:
            self.spark.sparkContext.setCheckpointDir(self.checkpoint_dir)
            self.df_duplicate_old.checkpoint()
            self.df_duplicate_new.checkpoint()
            self.df_orphan_old.checkpoint()
            self.df_orphan_new.checkpoint()
            self.df_regression.checkpoint()
            self.df_diff.checkpoint()

    # Schema Analysis
    # -------------------------------------------------------------------------
    @cached_property
    def columns_old(self) -> Set[str]:
        "Columns in df_old"
        return set([col for col in self.df_old.columns if not col == "pk"])

    @cached_property
    def columns_new(self) -> Set[str]:
        "Columns in df_new"
        return set([col for col in self.df_new.columns if not col == "pk"])

    @cached_property
    def columns_all(self) -> Set[str]:
        "All columns between df_old and df_new"
        return self.columns_new.union(self.columns_old)

    @cached_property
    def columns_added(self) -> Set[str]:
        "Columns in df_new that are not in df_old"
        return self.columns_new.difference(self.columns_old)

    @cached_property
    def columns_removed(self) -> Set[str]:
        "Columns in df_old that are not in df_new"
        return self.columns_old.difference(self.columns_new)

    @cached_property
    def columns_kept(self) -> Set[str]:
        "Columns that are in both df_old and df_new"
        return self.columns_old.intersection(self.columns_new)

    @cached_property
    def schema_mutations(self) -> Set[SchemaMutation]:
        "Detail for all schema mutations between df_old and df_new"
        schema_mutations = set()
        for col in self.columns_kept:
            schema_old = self.df_old.schema[col].jsonValue()
            schema_new = self.df_new.schema[col].jsonValue()
            for attribute in schema_old.keys():
                if schema_old[attribute] != schema_new[attribute]:
                    schema_mutations.add(
                        SchemaMutation(
                            column_name=col,
                            attribute=attribute,
                            value_old=str(schema_old[attribute]),
                            value_new=str(schema_new[attribute]),
                        )
                    )
        return schema_mutations

    @cached_property
    def schema_mutations_type(self) -> Set[SchemaMutation]:
        "Detail for schema mutations of 'data_type' attribute"
        return set([sm for sm in self.schema_mutations if sm.attribute == "type"])

    @cached_property
    def schema_mutations_nullable(self) -> Set[SchemaMutation]:
        "Detail for schema mutations of 'nullable' attribute"
        return set([sm for sm in self.schema_mutations if sm.attribute == "nullable"])

    @cached_property
    def schema_mutations_metadata(self) -> Set[SchemaMutation]:
        "Detail for schema mutations of 'metadata' attribute"
        return set([sm for sm in self.schema_mutations if sm.attribute == "metadata"])

    @cached_property
    def columns_changed_type(self) -> Set[str]:
        "Columns in both df_old and df_new for which their 'data_type' attribute changed"
        return set([sm.column_name for sm in self.schema_mutations_type])

    @cached_property
    def columns_changed_nullable(self) -> Set[str]:
        "Columns in both df_old and df_new for which their 'nullable' attribute changed"
        return set([sm.column_name for sm in self.schema_mutations_nullable])

    @cached_property
    def columns_changed_metadata(self) -> Set[str]:
        "Columns in both df_old and df_new for which their 'metadata' attribute changed"
        return set([sm.column_name for sm in self.schema_mutations_metadata])

    @cached_property
    def columns_comparable(self) -> Set[str]:
        """
        Columns that can be compared for regression test. Comparable columns must

        1. be present in df_old and df_new
        2. have the same data_type in df_old and df_new
        """
        return self.columns_kept.difference(self.columns_changed_type)

    # Base Table Info
    # -------------------------------------------------------------------------
    @cached_property
    def count_record_old(self) -> int:
        "Count of records in df_old"
        return self.df_old.count()

    @cached_property
    def count_record_new(self) -> int:
        "Count of records in df_new"
        return self.df_new.count()

    @cached_property
    def count_pk_old(self) -> int:
        "Count of pks in df_old"
        return self.df_old.select(F.col("pk")).distinct().count()

    @cached_property
    def count_pk_new(self) -> int:
        "Count of pks in df_new"
        return self.df_new.select(F.col("pk")).distinct().count()

    # Duplicate Analysis
    # -------------------------------------------------------------------------
    @property
    def _df_duplicate_old(self) -> DataFrame:
        return (
            self.df_old.select(F.col("pk"))
            .groupBy(F.col("pk"))
            .agg((F.count(F.col("pk")) - 1).alias("count_record_duplicate"))
            .filter(F.col("count_record_duplicate") > 0)
        )

    @property
    def _df_duplicate_new(self) -> DataFrame:
        return (
            self.df_new.select(F.col("pk"))
            .groupBy(F.col("pk"))
            .agg((F.count(F.col("pk")) - 1).alias("count_record_duplicate"))
            .filter(F.col("count_record_duplicate") > 0)
        )

    @cached_property
    def count_record_duplicate_old(self) -> int:
        "Count of duplicate records in df_old"
        return self.df_duplicate_old.agg(F.sum(F.col("count_record_duplicate")).alias("crd")).collect()[0][0] or 0

    @cached_property
    def count_record_duplicate_new(self) -> int:
        "Count of duplicate records in df_new"
        return self.df_duplicate_new.agg(F.sum(F.col("count_record_duplicate")).alias("crd")).collect()[0][0] or 0

    @cached_property
    def count_pk_duplicate_old(self) -> int:
        "Count of pks that have duplicates in df_old"
        return self.df_duplicate_old.count() or 0

    @cached_property
    def count_pk_duplicate_new(self) -> int:
        "Count of pks that have duplicates in df_new"
        return self.df_duplicate_new.count() or 0

    @cached_property
    def sample_pk_duplicate_old(self) -> tuple:
        "num_sample samples of pks that have duplicate records in df_old"
        return tuple(
            [
                row.pk
                for row in (
                    self.df_duplicate_old.orderBy([F.col("count_record_duplicate").desc(), F.col("pk")])  # sort worst offenders first, then pk
                    .select(F.col("pk"))
                    .limit(self.num_samples)
                    .collect()
                )
            ]
        )

    @cached_property
    def sample_pk_duplicate_new(self) -> tuple:
        "num_sample samples of pks that have duplicate records in df_new"
        return tuple(
            [
                row.pk
                for row in (
                    self.df_duplicate_new.orderBy([F.col("count_record_duplicate").desc(), F.col("pk")])  # sort worst offenders first, then pk
                    .select(F.col("pk"))
                    .limit(self.num_samples)
                    .collect()
                )
            ]
        )

    @cached_property
    def has_symmetric_duplicates(self) -> bool:
        "True if duplicates in df_old and df_new are exactly the same."
        if self.count_record_duplicate_old == 0:  # Can't have symmetric duplicates without at least one duplicate
            return False
        __df_dup_old_comp = self.df_old.join(self.df_duplicate_old, how="left_semi", on=["pk"]).select(list(self.columns_comparable))
        __df_dup_new_comp = self.df_new.join(self.df_duplicate_new, how="left_semi", on=["pk"]).select(list(self.columns_comparable))
        return (__df_dup_old_comp.exceptAll(__df_dup_new_comp)).union(__df_dup_new_comp.exceptAll(__df_dup_old_comp)).count() == 0

    # Orphan Analysis
    # -------------------------------------------------------------------------
    @property
    def _df_orphan_old(self) -> DataFrame:
        return self.df_old.join(self.df_new, how="left_anti", on=["pk"]).select(F.col("pk")).distinct()

    @property
    def _df_orphan_new(self) -> DataFrame:
        return self.df_new.join(self.df_old, how="left_anti", on=["pk"]).select(F.col("pk")).distinct()

    @cached_property
    def count_pk_orphan_old(self) -> int:
        "Count of pks that are in df_old but not df_new"
        return self.df_orphan_old.count() or 0

    @cached_property
    def count_pk_orphan_new(self) -> int:
        "Count of pks that are in df_new but not df_old"
        return self.df_orphan_new.count() or 0

    @cached_property
    def sample_pk_orphan_old(self) -> tuple:
        "num_sample samples of pks that are in df_old but not df_new"
        return tuple([row.pk for row in self.df_orphan_old.select(F.col("pk")).orderBy(F.col("pk")).limit(self.num_samples).collect()])

    @cached_property
    def sample_pk_orphan_new(self) -> tuple:
        "num_sample samples of pks that are in df_new but not df_old"
        return tuple([row.pk for row in self.df_orphan_new.select(F.col("pk")).orderBy(F.col("pk")).limit(self.num_samples).collect()])

    # Diff Analysis
    # -------------------------------------------------------------------------
    @property
    def _df_comparable(self) -> DataFrame:
        df_old_no_dups = self.df_old.join(self.df_duplicate_old, how="left_anti", on=["pk"])
        df_new_no_dups = self.df_new.join(self.df_duplicate_new, how="left_anti", on=["pk"])
        column_to_data_type = {s.name: s.dataType.typeName() for s in self.df_old.schema}
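        # Pack each comparable column's old/new values (cast to string) into a
        # map, so a single array column can be filtered and exploded per-column later.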
        df_comparable = (
            df_old_no_dups.alias("o")
            .join(df_new_no_dups.alias("n"), on=(F.col("o.pk") == F.col("n.pk")))
            .select(
                F.col("o.pk"),
                F.array(
                    [
                        F.create_map(
                            F.lit("column_name"),
                            F.lit(c),
                            F.lit("data_type"),
                            F.lit(column_to_data_type[c]),
                            F.lit("old_value"),
                            F.col(f"o.{c}").cast(StringType()),
                            F.lit("new_value"),
                            F.col(f"n.{c}").cast(StringType()),
                        )
                        for c in self.columns_comparable
                    ]
                ).alias("column_map"),
            )
        )
        return df_comparable

    @cached_property
    def count_pk_comparable(self) -> int:
        "Count of comparable records between df_old and df_new"
        df_excluded_new = self.df_orphan_new.select("pk").union(self.df_duplicate_new.select("pk")).distinct()
        df_excluded_old = self.df_orphan_old.select("pk").union(self.df_duplicate_old.select("pk")).distinct()
        count_usable_new = self.count_pk_new - df_excluded_new.count()
        count_usable_old = self.count_pk_old - df_excluded_old.count()
        return min(count_usable_new, count_usable_old)

    @property
    def _df_regression(self):
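        # SQL '!=' is not null-safe (NULL != x yields NULL, which FILTER drops),
        # so null <-> value flips are kept via the explicit IS NULL clauses.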
        return self.df_comparable.select(
            F.col("pk"),
            F.expr(
                """
                        FILTER(
                            column_map,
                            column -> (
                                (column.old_value != column.new_value)
                                OR (column.old_value IS NULL AND column.new_value IS NOT NULL)
                                OR (column.old_value IS NOT NULL AND column.new_value IS NULL)
                            )
                        )
                        """
            ).alias("diffs"),
        ).filter(F.size(F.col("diffs")) > 0)

    @cached_property
    def count_record_diff(self) -> int:
        "Count of records with diffs"
        return self.df_regression.count() or 0

    @cached_property
    def count_pk_diff(self) -> int:
        "Count of pks with diffs"
        return self.df_regression.select(F.col("pk")).distinct().count() or 0

    @property
    def _df_diff_cols(self) -> DataFrame:
        return (
            self.df_regression.select(F.explode(F.col("diffs")).alias("diff"))
            .select(F.col("diff.column_name").alias("diff_cols"))
            .distinct()
            .orderBy(F.col("diff_cols"))
        )

    @cached_property
    def columns_diff(self) -> Set[str]:
        "Columns containing at least one value difference between df_old and df_new"
        return set([row.diff_cols for row in self.df_diff_cols.collect()])

    @staticmethod
    def __quote_if_string(col: Column, data_type: Column) -> Column:
        return F.when(data_type == F.lit("string"), F.concat(F.lit("'"), col, F.lit("'"))).otherwise(col)

    @staticmethod
    def __diff_category(x: Column, y: Column, data_type: Column) -> Column:
        """
        Categorizes the difference between two columns.
        """
        col_default = F.lit("uncategorized")
        return (
            # Test 'None'-string vs null flips before the plain null flips,
            # otherwise the null-flip branches shadow them and they never match.
            F.when((x == F.lit("None")) & y.isNull(), F.lit("'None' flip ('None' -> null)"))
            .when(x.isNull() & (y == F.lit("None")), F.lit("'None' flip (null -> 'None')"))
            .when(x.isNull() & y.isNotNull(), F.lit("null flip (null -> not null)"))
            .when(x.isNotNull() & y.isNull(), F.lit("null flip (not null -> null)"))
            .when((x == F.lit("None")) & y.isNotNull(), F.lit("'None' flip ('None' -> not null)"))
            .when(x.isNotNull() & (y == F.lit("None")), F.lit("'None' flip (not null -> 'None')"))
            .otherwise(
                F.when(
                    data_type == F.lit("string"),
                    F.when((F.trim(y) == x) & (F.ltrim(y) != x) & (F.rtrim(y) != x), F.lit("padding added (left and right)"))
                    .when((F.trim(x) == y) & (F.ltrim(x) != y) & (F.rtrim(x) != y), F.lit("padding removed (left and right)"))
                    .when(F.ltrim(y) == x, F.lit("padding added (left)"))
                    .when(F.rtrim(y) == x, F.lit("padding added (right)"))
                    .when(F.ltrim(x) == y, F.lit("padding removed (left)"))
                    .when(F.rtrim(x) == y, F.lit("padding removed (right)"))
                    .when(F.upper(x) == y, F.lit("capitalization added"))
                    .when(F.lower(y) == x, F.lit("capitalization added"))
                    .when(F.upper(y) == x, F.lit("capitalization removed"))
                    .when(F.lower(x) == y, F.lit("capitalization removed"))
                    .when(F.upper(x) == F.upper(y), F.lit("capitalization changed"))
                    .when(x.startswith(y), F.lit("truncation added"))
                    .when(y.startswith(x), F.lit("truncation removed"))
                    .otherwise(col_default),
                )
                .when(
                    data_type == F.lit("float"),
                    F.when(F.round(x.cast(FloatType()), 2) == F.round(y.cast(FloatType()), 2), F.lit("rounding"))
                    .when(x.cast(FloatType()) % y.cast(FloatType()) == F.lit(0), F.lit("multiple"))
                    .when(y.cast(FloatType()) % x.cast(FloatType()) == F.lit(0), F.lit("multiple"))
                    .otherwise(col_default),
                )
                .when(
                    # Because the precision isn't known, cast decimals to double
                    data_type.isin("double", "decimal"),
                    F.when(F.round(x.cast(DoubleType()), 2) == F.round(y.cast(DoubleType()), 2), F.lit("rounding"))
                    .when(x.cast(DoubleType()) % y.cast(DoubleType()) == F.lit(0), F.lit("multiple"))
                    .when(y.cast(DoubleType()) % x.cast(DoubleType()) == F.lit(0), F.lit("multiple"))
                    .otherwise(col_default),
                )
                .when(
                    data_type == F.lit("timestamp"),
                    F.when(F.abs(x.cast(TimestampType()).cast(LongType()) - y.cast(TimestampType()).cast(LongType())) % 3600 == 0, F.lit("hour shift"))
                    .when(
                        F.date_trunc("millisecond", x.cast(TimestampType())) == F.date_trunc("millisecond", y.cast(TimestampType())),
                        F.lit("millisecond truncation"),
                    )
                    .when(F.date_trunc("second", x.cast(TimestampType())) == y.cast(TimestampType()), F.lit("time removed"))
                    .when(F.date_trunc("second", y.cast(TimestampType())) == x.cast(TimestampType()), F.lit("time added"))
                    .otherwise(col_default),
                )
                .when(
                    data_type == F.lit("boolean"),
                    F.when(x.cast(BooleanType()) & ~y.cast(BooleanType()), "boolean flip (true -> false)")
                    .when(y.cast(BooleanType()) & ~x.cast(BooleanType()), "boolean flip (false -> true)")
                    .otherwise(col_default),
                )
                .otherwise(col_default)
            )
        )

    @property
    def _df_diff(self):
        return self.df_regression.select(
            F.col("pk"),
            F.explode(F.col("diffs")).alias("diff"),
        ).select(
            F.col("diff.column_name").alias("column_name"),
            F.col("diff.data_type").alias("data_type"),
            F.col("pk"),
            self.__quote_if_string(F.col("diff.old_value"), F.col("diff.data_type")).alias("old_value"),
            self.__quote_if_string(F.col("diff.new_value"), F.col("diff.data_type")).alias("new_value"),
            self.__diff_category(
                F.col("diff.old_value"),
                F.col("diff.new_value"),
                F.col("diff.data_type"),
            ).alias("diff_category"),
        )

    @property
    def _df_diff_summary(self) -> DataFrame:
        return (
            self.df_diff.groupBy(F.col("column_name"), F.col("data_type"), F.col("diff_category"))
            .agg(F.count("pk").alias("count_record"), F.countDistinct("pk").alias("count_pk"))
            .orderBy(F.col("column_name"), F.col("diff_category"))
            .withColumn("count_pk_%oT", F.concat(F.round(F.col("count_pk") / F.lit(self.count_pk_comparable) * 100, 1).cast(StringType()), F.lit("%")))
        )

    # For each column_name and diff_category, provide samples
    @property
    def _df_diff_sample(self) -> DataFrame:
        return (
            self.df_diff.withColumn("rn", F.row_number().over(Window.partitionBy([F.col("column_name"), F.col("diff_category")]).orderBy(F.col("pk"))))
            .filter(F.col("rn") <= F.lit(self.num_samples))
            .drop("rn")
            .orderBy(F.col("column_name"), F.col("diff_category"), F.col("pk"))
        )

    # Regression Test Results
    # -------------------------------------------------------------------------
    @cached_property
    def success(self) -> bool:
        """
        Whether the table passed regression test.

        Regression Tests fail when:

        - Columns are added
        - Columns are removed
        - Columns change data_type
        - Duplicates exist (and are not symmetrical)
        - Orphans exist
        - Value Diffs exist
        """
        if self.columns_added:
            return False
        elif self.columns_removed:
            return False
        elif self.columns_changed_type:
            return False
        elif self.count_pk_duplicate_old > 0 and not self.has_symmetric_duplicates:
            return False
        elif self.count_pk_orphan_old > 0 or self.count_pk_orphan_new > 0:
            return False
        elif self.count_pk_diff > 0:
            return False
        else:
            return True

    # Summary
    # -------------------------------------------------------------------------
    @property
    def summary(self) -> str:
        """
        A string-based report that summarizes the results of the Regression Test in Markdown.
        """
        timer_start = perf_counter()
        report: List[str] = []

        if self.success:
            report.append(f"# {self.table_name}: SUCCESS")
        else:
            report.append(f"# {self.table_name}: FAILURE")

        if not self.success:
            report += [
                "\n### Table stats",
                f"- Count records in old {self.table_name}: {self.count_record_old}",
                f"- Count records in new {self.table_name}: {self.count_record_new}",
                f"- Count pks in old {self.table_name}: {self.count_pk_old}",
                f"- Count pks in new {self.table_name}: {self.count_pk_new}",
            ]

        if self.columns_added or self.columns_removed:
            report.append("\n### Column Changes")
            if self.columns_added:
                report.append(f"- Columns Added: {list(self.columns_added)}")
            if self.columns_removed:
                report.append(f"- Columns Removed: {list(self.columns_removed)}")

        if self.schema_mutations:
            report.append("\n### Schema Mutations")
            for sm in self.schema_mutations:
                report.append(f"- For column '{sm.column_name}', attribute '{sm.attribute}' changed from '{sm.value_old}' to '{sm.value_new}'.")

        if self.count_record_duplicate_old or self.count_record_duplicate_new:
            report += [
                "\n### Duplicates",
                f"- Count of duplicate records in old {self.table_name}: {self.count_record_duplicate_old}"
                + (f" (%oT: {(self.count_record_duplicate_old / self.count_record_old):.1%})" if self.count_record_old > 0 else ""),  # noqa: E501
                f"- Count of duplicate records in new {self.table_name}: {self.count_record_duplicate_new}"
                + (f" (%oT: {(self.count_record_duplicate_new / self.count_record_new):.1%})" if self.count_record_new > 0 else ""),  # noqa: E501
                f"- Count of duplicate pks in old {self.table_name}: {self.count_pk_duplicate_old}"
                + (f" (%oT: {(self.count_pk_duplicate_old / self.count_pk_old):.1%})" if self.count_pk_old > 0 else ""),  # noqa: E501
                f"- Count of duplicate pks in new {self.table_name}: {self.count_pk_duplicate_new}"
                + (f" (%oT: {(self.count_pk_duplicate_new / self.count_pk_new):.1%})" if self.count_pk_new > 0 else ""),  # noqa: E501
                f"- Sample of duplicate pks in old {self.table_name}: {[str(sample) for sample in self.sample_pk_duplicate_old]}",
                f"- Sample of duplicate pks in new {self.table_name}: {[str(sample) for sample in self.sample_pk_duplicate_new]}",
            ]
            if self.has_symmetric_duplicates:
                report.append("**NOTE: Duplicates are exactly the same between df_old and df_new**")

        if self.count_pk_orphan_old or self.count_pk_orphan_new:
            report += [
                "\n### Orphans",
                f"- Count of orphan pks in old {self.table_name}: {self.count_pk_orphan_old}"
                + (f" (%oT: {(self.count_pk_orphan_old / self.count_pk_old):.1%})" if self.count_pk_old > 0 else ""),  # noqa: E501
                f"- Count of orphan pks in new {self.table_name}: {self.count_pk_orphan_new}"
                + (f" (%oT: {(self.count_pk_orphan_new / self.count_pk_new):.1%})" if self.count_pk_new > 0 else ""),  # noqa: E501
                f"- Sample of orphan pks in old {self.table_name}: {[str(sample) for sample in self.sample_pk_orphan_old]}",
                f"- Sample of orphan pks in new {self.table_name}: {[str(sample) for sample in self.sample_pk_orphan_new]}",
            ]

        if self.count_pk_diff:
            report += [
                "\n### Diffs",
                f"- Columns with diffs: {self.columns_diff}",
                f"- Pks with diffs: {self.count_pk_diff} (%oT: {(self.count_pk_diff / self.count_pk_comparable):.1%})\n",
                "Diff Summary:\n",
                self.df_diff_summary.toPandas().to_markdown(index=False),  # type: ignore [attr-defined]
                "\n",
                "Diff Samples: (5 samples per column_name, per diff_category, per is_duplicate)\n",
                self.df_diff_sample.toPandas().to_markdown(index=False),  # type: ignore [attr-defined]
            ]

        timer_stop = perf_counter()
        time = str(timedelta(seconds=round(timer_stop - timer_start, 0)))
        time_split = time.split(":")
        report.append(f"\n> Time to complete: {time_split[0]} Hours {time_split[1]} Minutes {time_split[2]} Seconds\n")

        return "\n".join(report)

columns_old cached property

Columns in df_old

columns_new cached property

Columns in df_new

columns_all cached property

All columns between df_old and df_new

columns_added cached property

Columns in df_new that are not in df_old

columns_removed cached property

Columns in df_old that are not in df_new

columns_kept cached property

Columns that are in both df_old and df_new

columns_changed_type cached property

Columns in both df_old and df_new for which their 'data_type' attribute changed

columns_changed_nullable cached property

Columns in both df_old and df_new for which their 'nullable' attribute changed

columns_changed_metadata cached property

Columns in both df_old and df_new for which their 'metadata' attribute changed

columns_comparable cached property

Columns that can be compared for regression test. Comparable columns must

  1. be present in df_old and df_new
  2. have the same data_type in df_old and df_new
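
Continuing the constructor sketch above, a quick way to inspect these column sets (the print formatting is illustrative):

```python
# Report schema drift before looking at value-level diffs
if rt.columns_added:
    print(f"Added columns: {sorted(rt.columns_added)}")
if rt.columns_removed:
    print(f"Removed columns: {sorted(rt.columns_removed)}")

# Only columns present in both tables with an unchanged data_type are compared
print(f"Comparable columns: {sorted(rt.columns_comparable)}")
```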

columns_diff cached property

Columns containing at least one value difference between df_old and df_new

schema_mutations cached property

Detail for all schema mutations between df_old and df_new

schema_mutations_type cached property

Detail for schema mutations of 'data_type' attribute

schema_mutations_nullable cached property

Detail for schema mutations of 'nullable' attribute

schema_mutations_metadata cached property

Detail for schema mutations of 'metadata' attribute

count_record_old cached property

Count of records in df_old

count_record_new cached property

Count of records in df_new

count_pk_old cached property

Count of pks in df_old

count_pk_new cached property

Count of pks in df_new

count_record_duplicate_old cached property

Count of duplicate records in df_old

count_record_duplicate_new cached property

Count of duplicate records in df_new

count_pk_duplicate_old cached property

Count of pks that have duplicates in df_old

count_pk_duplicate_new cached property

Count of pks that have duplicates in df_new

has_symmetric_duplicates cached property

True if duplicates in df_old and df_new are exactly the same.
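
For example, a toy sketch reusing the SparkSession from the constructor example: when both tables carry the identical duplicated row, the duplicates are symmetric and are not by themselves evidence of a regression.

```python
# pk 1 is duplicated identically in both tables
df_old = spark.createDataFrame([(1, "a"), (1, "a"), (2, "b")], ["id", "val"])
df_new = spark.createDataFrame([(1, "a"), (1, "a"), (2, "b")], ["id", "val"])
rt = RegressionTest(spark, df_old, df_new, pk="id")
print(rt.has_symmetric_duplicates)  # True
```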

count_pk_orphan_old cached property

Count of pks that are in df_old but not df_new

count_pk_orphan_new cached property

Count of pks that are in df_new but not df_old

count_pk_comparable cached property

Count of comparable pks between df_old and df_new

count_record_diff cached property

Count of records with diffs

count_pk_diff cached property

Count of pks with diffs

sample_pk_duplicate_old cached property

num_samples samples of pks that have duplicate records in df_old

sample_pk_duplicate_new cached property

num_samples samples of pks that have duplicate records in df_new

sample_pk_orphan_old cached property

num_samples samples of pks that are in df_old but not df_new

sample_pk_orphan_new cached property

num_samples samples of pks that are in df_new but not df_old

df_duplicate_old = self._df_duplicate_old instance-attribute

DataFrame of duplicate pks in old table, with count of duplicates

df_duplicate_new = self._df_duplicate_new instance-attribute

DataFrame of duplicate pks in new table, with count of duplicates

df_orphan_old = self._df_orphan_old instance-attribute

DataFrame of orphan pks in old table

df_orphan_new = self._df_orphan_new instance-attribute

DataFrame of orphan pks in new table

df_comparable = self._df_comparable instance-attribute

DataFrame of rows that are eligible for regression testing.

Rows are eligible if they:

  1. Are not duplicates
  2. Are not orphans
  3. Have matching primary keys between df_old and df_new

df_regression = self._df_regression instance-attribute

Same as df_comparable, but with column_map filtered for actual differences and renamed to diffs.

df_diff_cols = self._df_diff_cols instance-attribute

DataFrame of column names that contain diffs.

df_diff = self._df_diff instance-attribute

DataFrame that pivots and filters df_regression to show one row per column containing values that are different

df_diff_summary = self._df_diff_summary instance-attribute

A summary of df_diff, aggregating a count of records/pks

df_diff_sample = self._df_diff_sample instance-attribute

Same as df_diff, but limited to num_samples rows per column per diff category
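
Both diff DataFrames can be inspected directly; a minimal sketch (the show() options are illustrative):

```python
# One row per (column_name, data_type, diff_category) with record/pk counts
rt.df_diff_summary.show(truncate=False)

# Up to num_samples example rows per column_name and diff_category
rt.df_diff_sample.show(truncate=False)
```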

success cached property

Whether the table passed regression test.

Regression Tests fail when:

  • Columns are added
  • Columns are removed
  • Columns change data_type
  • Duplicates exist (and are not symmetrical)
  • Orphans exist
  • Value Diffs exist

summary property

A string-based report that summarizes the results of the Regression Test in Markdown.
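
Taken together, success and summary support a simple regression gate; a minimal sketch (the report path and the choice to raise are illustrative, not part of the API):

```python
# Fail the pipeline and persist the Markdown report when a regression is found
if not rt.success:
    with open("regression_report.md", "w") as f:
        f.write(rt.summary)
    raise AssertionError(f"Regression detected in {rt.table_name}; see regression_report.md")
```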