Computing statistics and p-values

Module containing all the necessary steps to perform statistical analysis.

`compute_padj`

Module containing the Mixin to compute adjusted p-values.

`compute_padj`

`ComputeAdjustedPValues`

Bases: IndependentFiltering, PValueAdjustment

Mixin class to implement the computation of adjusted p-values.

Attributes:

Name	Type	Description
`independent_filter`	`bool`	A boolean flag to indicate whether to use independent filtering or not.

Methods:

Name	Description
`compute_adjusted_p_values`	A method to compute adjusted p-values. Runs independent filtering if self.independent_filter is True. Runs BH method otherwise.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/compute_padj.py

class ComputeAdjustedPValues(IndependentFiltering, PValueAdjustment):
    """Mixin class to implement the computation of adjusted p-values.

    Attributes
    ----------
    independent_filter: bool
        A boolean flag to indicate whether to use independent filtering or not.

    Methods
    -------
    compute_adjusted_p_values
        A method to compute adjusted p-values.
        Runs independent filtering if self.independent_filter is True.
        Runs BH method otherwise.
    """

    independent_filter: bool = False

    @log_organisation_method
    def compute_adjusted_p_values(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_test_shared_state,
        round_idx,
        clean_models,
    ):
        """Compute adjusted p-values.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: dict
            Local states. Required to propagate intermediate results.

        wald_test_shared_state: dict
            Shared states containing the Wald test results.

        round_idx: int
            The current round.

        clean_models: bool
            If True, the models are cleaned.

        Returns
        -------
        local_states: dict
            Local states. Required to propagate intermediate results.


        round_idx: int
            The updated round index.
        """
        if self.independent_filter:
            local_states, _, round_idx = local_step(
                local_method=self.run_independent_filtering,
                train_data_nodes=train_data_nodes,
                output_local_states=local_states,
                round_idx=round_idx,
                input_local_states=local_states,
                input_shared_state=wald_test_shared_state,
                aggregation_id=aggregation_node.organization_id,
                description="Compute adjusted P values using independent filtering.",
                clean_models=clean_models,
            )
        else:
            local_states, _, round_idx = local_step(
                local_method=self.run_p_value_adjustment,
                train_data_nodes=train_data_nodes,
                output_local_states=local_states,
                round_idx=round_idx,
                input_local_states=local_states,
                input_shared_state=wald_test_shared_state,
                aggregation_id=aggregation_node.organization_id,
                description="Compute adjusted P values using BH method.",
                clean_models=clean_models,
            )

        return local_states, round_idx

`compute_adjusted_p_values(train_data_nodes, aggregation_node, local_states, wald_test_shared_state, round_idx, clean_models)`

Compute adjusted p-values.

Parameters:

Name	Description	Default
`train_data_nodes`	List of TrainDataNode.	required
`aggregation_node`	The aggregation node.	required
`local_states`	Local states. Required to propagate intermediate results.	required
`wald_test_shared_state`	Shared states containing the Wald test results.	required
`round_idx`	The current round.	required
`clean_models`	If True, the models are cleaned.	required

Returns:

Name	Type	Description
`local_states`	`dict`	Local states. Required to propagate intermediate results.
`round_idx`	`int`	The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/compute_padj.py

@log_organisation_method
def compute_adjusted_p_values(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    wald_test_shared_state,
    round_idx,
    clean_models,
):
    """Compute adjusted p-values.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: dict
        Local states. Required to propagate intermediate results.

    wald_test_shared_state: dict
        Shared states containing the Wald test results.

    round_idx: int
        The current round.

    clean_models: bool
        If True, the models are cleaned.

    Returns
    -------
    local_states: dict
        Local states. Required to propagate intermediate results.


    round_idx: int
        The updated round index.
    """
    if self.independent_filter:
        local_states, _, round_idx = local_step(
            local_method=self.run_independent_filtering,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Compute adjusted P values using independent filtering.",
            clean_models=clean_models,
        )
    else:
        local_states, _, round_idx = local_step(
            local_method=self.run_p_value_adjustment,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Compute adjusted P values using BH method.",
            clean_models=clean_models,
        )

    return local_states, round_idx

`substeps`

`IndependentFiltering`

Mixin class implementing independent filtering.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`	Local AnnData object.
`alpha`	`float`	Significance level.

Methods:

Name	Description
`run_independent_filtering`	Run independent filtering on the p-values trend

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py

class IndependentFiltering:
    """Mixin class implementing independent filtering.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    alpha : float
        Significance level.

    Methods
    -------
    run_independent_filtering
        Run independent filtering on the p-values trend
    """

    local_adata: AnnData
    alpha: float

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def run_independent_filtering(self, data_from_opener, shared_state: Any):
        """Run independent filtering on the p-values trend.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state containing the results of the wald tests, namely
            - "p_values" : p-values
            - "wald_statistics" : Wald statistics
            - "wald_se" : Wald standard errors
        """
        p_values = shared_state["p_values"]
        wald_statistics = shared_state["wald_statistics"]
        wald_se = shared_state["wald_se"]

        self.local_adata.varm["p_values"] = p_values
        self.local_adata.varm["wald_statistics"] = wald_statistics
        self.local_adata.varm["wald_se"] = wald_se

        base_mean = self.local_adata.varm["_normed_means"]

        lower_quantile = np.mean(base_mean == 0)

        if lower_quantile < 0.95:
            upper_quantile = 0.95
        else:
            upper_quantile = 1

        theta = np.linspace(lower_quantile, upper_quantile, 50)
        cutoffs = np.quantile(base_mean, theta)

        result = pd.DataFrame(
            np.nan, index=self.local_adata.var_names, columns=np.arange(len(theta))
        )

        for i, cutoff in enumerate(cutoffs):
            use = (base_mean >= cutoff) & (~np.isnan(p_values))
            U2 = p_values[use]
            if not len(U2) == 0:
                result.loc[use, i] = false_discovery_control(U2, method="bh")

        num_rej = (result < self.alpha).sum(0)
        lowess_res = lowess(theta, num_rej, frac=1 / 5)

        if num_rej.max() <= 10:
            j = 0
        else:
            residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
            thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))

            if np.any(num_rej > thresh):
                j = np.where(num_rej > thresh)[0][0]
            else:
                j = 0

        self.local_adata.varm["padj"] = result.loc[:, j]

`run_independent_filtering(data_from_opener, shared_state)`

Run independent filtering on the p-values trend.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state containing the results of the wald tests, namely - "p_values" : p-values - "wald_statistics" : Wald statistics - "wald_se" : Wald standard errors	required

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
def run_independent_filtering(self, data_from_opener, shared_state: Any):
    """Run independent filtering on the p-values trend.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state containing the results of the wald tests, namely
        - "p_values" : p-values
        - "wald_statistics" : Wald statistics
        - "wald_se" : Wald standard errors
    """
    p_values = shared_state["p_values"]
    wald_statistics = shared_state["wald_statistics"]
    wald_se = shared_state["wald_se"]

    self.local_adata.varm["p_values"] = p_values
    self.local_adata.varm["wald_statistics"] = wald_statistics
    self.local_adata.varm["wald_se"] = wald_se

    base_mean = self.local_adata.varm["_normed_means"]

    lower_quantile = np.mean(base_mean == 0)

    if lower_quantile < 0.95:
        upper_quantile = 0.95
    else:
        upper_quantile = 1

    theta = np.linspace(lower_quantile, upper_quantile, 50)
    cutoffs = np.quantile(base_mean, theta)

    result = pd.DataFrame(
        np.nan, index=self.local_adata.var_names, columns=np.arange(len(theta))
    )

    for i, cutoff in enumerate(cutoffs):
        use = (base_mean >= cutoff) & (~np.isnan(p_values))
        U2 = p_values[use]
        if not len(U2) == 0:
            result.loc[use, i] = false_discovery_control(U2, method="bh")

    num_rej = (result < self.alpha).sum(0)
    lowess_res = lowess(theta, num_rej, frac=1 / 5)

    if num_rej.max() <= 10:
        j = 0
    else:
        residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
        thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))

        if np.any(num_rej > thresh):
            j = np.where(num_rej > thresh)[0][0]
        else:
            j = 0

    self.local_adata.varm["padj"] = result.loc[:, j]

`PValueAdjustment`

Mixin class implementing p-value adjustment.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`	Local AnnData object.

Methods:

Name	Description
`run_p_value_adjustment`	Run p-value adjustment on the p-values trend using the Benjamini-Hochberg method.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py

class PValueAdjustment:
    """Mixin class implementing p-value adjustment.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    run_p_value_adjustment
        Run p-value adjustment on the p-values trend using the Benjamini-Hochberg
        method.
    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def run_p_value_adjustment(self, data_from_opener, shared_state: Any):
        """Run p-value adjustment on the p-values trend using the BH method.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state containing the results of the Wald tests, namely
            - "p_values" : p-values, as a numpy array
            - "wald_statistics" : Wald statistics
            - "wald_se" : Wald standard errors
        """
        p_values = shared_state["p_values"]
        wald_statistics = shared_state["wald_statistics"]
        wald_se = shared_state["wald_se"]

        self.local_adata.varm["p_values"] = p_values
        self.local_adata.varm["wald_statistics"] = wald_statistics
        self.local_adata.varm["wald_se"] = wald_se

        padj = pd.Series(np.nan, index=self.local_adata.var_names)
        padj.loc[~np.isnan(p_values)] = false_discovery_control(
            p_values[~np.isnan(p_values)], method="bh"
        )

        self.local_adata.varm["padj"] = padj

`run_p_value_adjustment(data_from_opener, shared_state)`

Run p-value adjustment on the p-values trend using the BH method.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state containing the results of the Wald tests, namely - "p_values" : p-values, as a numpy array - "wald_statistics" : Wald statistics - "wald_se" : Wald standard errors	required

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
def run_p_value_adjustment(self, data_from_opener, shared_state: Any):
    """Run p-value adjustment on the p-values trend using the BH method.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state containing the results of the Wald tests, namely
        - "p_values" : p-values, as a numpy array
        - "wald_statistics" : Wald statistics
        - "wald_se" : Wald standard errors
    """
    p_values = shared_state["p_values"]
    wald_statistics = shared_state["wald_statistics"]
    wald_se = shared_state["wald_se"]

    self.local_adata.varm["p_values"] = p_values
    self.local_adata.varm["wald_statistics"] = wald_statistics
    self.local_adata.varm["wald_se"] = wald_se

    padj = pd.Series(np.nan, index=self.local_adata.var_names)
    padj.loc[~np.isnan(p_values)] = false_discovery_control(
        p_values[~np.isnan(p_values)], method="bh"
    )

    self.local_adata.varm["padj"] = padj

`cooks_filtering`

Substep to perform cooks filtering.

`cooks_filtering`

Module to implement the base Mixin class for Cooks filtering.

`CooksFiltering`

Bases: LocFindCooksOutliers, AggregateCooksOutliers, LocGetMaxCooks, AggMaxCooks, LocGetMaxCooksCounts, AggMaxCooksCounts, LocCountNumberSamplesAbove, AggCooksFiltering

A class to perform Cooks filtering of p-values.

Methods:

Name	Description
`cooks_filtering`	The method to find Cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/cooks_filtering.py

class CooksFiltering(
    LocFindCooksOutliers,
    AggregateCooksOutliers,
    LocGetMaxCooks,
    AggMaxCooks,
    LocGetMaxCooksCounts,
    AggMaxCooksCounts,
    LocCountNumberSamplesAbove,
    AggCooksFiltering,
):
    """A class to perform Cooks filtering of p-values.

    Methods
    -------
    cooks_filtering
        The method to find Cooks outliers.
    """

    @log_organisation_method
    def cooks_filtering(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_test_shared_state,
        round_idx,
        clean_models,
    ):
        """Perform Cooks filtering.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: list[dict]
            Local states. Required to propagate intermediate results.

        wald_test_shared_state : dict
            A shared state containing the Wald test results.
            These results are the following fields:
            - "p_values": p-values of the Wald test.
            - "wald_statistics" : Wald statistics.
            - "wald_se" : Wald standard errors.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.

        Returns
        -------
        local_states: dict
            Local states. The new local state contains Cook's distances.

        shared_state: dict
            A new shared state containing the following fields:
            - "p_values": p-values of the Wald test, updated to be nan for Cook's
            outliers.
            - "wald_statistics" : Wald statistics, for compatibility.
            - "wald_se" : Wald standard errors, for compatibility.

        round_idx: int
            The updated round index.
        """
        local_states, shared_states, round_idx = local_step(
            local_method=self.find_local_cooks_outliers,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Find local Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        cooks_outliers_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_cooks_outliers,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Find the global Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.get_max_local_cooks,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=cooks_outliers_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Get local max cooks distance",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        max_cooks_distance_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_max_cooks,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Get the max cooks distance for the outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.get_max_local_cooks_gene_counts,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=max_cooks_distance_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Get the local max gene counts for the Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        max_cooks_gene_counts_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_max_cooks_gene_counts,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Get the max gene counts for the Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.count_local_number_samples_above,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=max_cooks_gene_counts_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Count the number of samples above the max gene counts",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        cooks_filtered_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_cooks_filtering,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Finish Cooks filtering",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        return local_states, cooks_filtered_shared_state, round_idx

`cooks_filtering(train_data_nodes, aggregation_node, local_states, wald_test_shared_state, round_idx, clean_models)`

Perform Cooks filtering.

Parameters:

Name	Type	Description	Default
`train_data_nodes`		List of TrainDataNode.	required
`aggregation_node`		The aggregation node.	required
`local_states`		Local states. Required to propagate intermediate results.	required
`wald_test_shared_state`	`dict`	A shared state containing the Wald test results. These results are the following fields: - "p_values": p-values of the Wald test. - "wald_statistics" : Wald statistics. - "wald_se" : Wald standard errors.	required
`round_idx`		Index of the current round.	required
`clean_models`		Whether to clean the models after the computation.	required

Returns:

Name	Type	Description
`local_states`	`dict`	Local states. The new local state contains Cook's distances.
`shared_state`	`dict`	A new shared state containing the following fields: - "p_values": p-values of the Wald test, updated to be nan for Cook's outliers. - "wald_statistics" : Wald statistics, for compatibility. - "wald_se" : Wald standard errors, for compatibility.
`round_idx`	`int`	The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/cooks_filtering.py

@log_organisation_method
def cooks_filtering(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    wald_test_shared_state,
    round_idx,
    clean_models,
):
    """Perform Cooks filtering.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: list[dict]
        Local states. Required to propagate intermediate results.

    wald_test_shared_state : dict
        A shared state containing the Wald test results.
        These results are the following fields:
        - "p_values": p-values of the Wald test.
        - "wald_statistics" : Wald statistics.
        - "wald_se" : Wald standard errors.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.

    Returns
    -------
    local_states: dict
        Local states. The new local state contains Cook's distances.

    shared_state: dict
        A new shared state containing the following fields:
        - "p_values": p-values of the Wald test, updated to be nan for Cook's
        outliers.
        - "wald_statistics" : Wald statistics, for compatibility.
        - "wald_se" : Wald standard errors, for compatibility.

    round_idx: int
        The updated round index.
    """
    local_states, shared_states, round_idx = local_step(
        local_method=self.find_local_cooks_outliers,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=wald_test_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Find local Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    cooks_outliers_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_cooks_outliers,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Find the global Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.get_max_local_cooks,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=cooks_outliers_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Get local max cooks distance",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    max_cooks_distance_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_max_cooks,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Get the max cooks distance for the outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.get_max_local_cooks_gene_counts,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=max_cooks_distance_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Get the local max gene counts for the Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    max_cooks_gene_counts_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_max_cooks_gene_counts,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Get the max gene counts for the Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.count_local_number_samples_above,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=max_cooks_gene_counts_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Count the number of samples above the max gene counts",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    cooks_filtered_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_cooks_filtering,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Finish Cooks filtering",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    return local_states, cooks_filtered_shared_state, round_idx

`substeps`

`AggCooksFiltering`

Mixin class to aggregate the cooks filtering.

Methods:

Name	Description
`agg_cooks_filtering`	Aggregate the local number of samples above.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class AggCooksFiltering:
    """Mixin class to aggregate the cooks filtering.

    Methods
    -------
    agg_cooks_filtering
        Aggregate the local number of samples above.
    """

    @remote
    @log_remote
    def agg_cooks_filtering(self, shared_states: list[dict]) -> dict:
        """Aggregate the local number of samples above to get cooks filtered genes.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_num_samples_above: np.ndarray of shape (n_genes,)
                The local number of samples above the max cooks gene counts.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test.
            - wald_statistics: np.ndarray of shape (n_genes,)
                The Wald statistics from the Wald test.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors from the Wald test.

        Returns
        -------
        dict
            A shared state with the following fields:
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test with nan for the cooks outliers.
            - wald_statistics: np.ndarray of shape (n_genes,)
                The Wald statistics.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors.
        """
        # Find the number of samples with counts above the max cooks
        cooks_outliers = shared_states[0]["cooks_outliers"]

        num_samples_above_max_cooks = np.sum(
            [state["local_num_samples_above"] for state in shared_states], axis=0
        )

        # If that number is greater than 3, set the cooks filter to false
        cooks_outliers[cooks_outliers] = num_samples_above_max_cooks < 3

        # Set the p-values to nan on cooks outliers
        p_values = shared_states[0]["p_values"]
        p_values[cooks_outliers] = np.nan

        return {
            "p_values": p_values,
            "wald_statistics": shared_states[0]["wald_statistics"],
            "wald_se": shared_states[0]["wald_se"],
        }

`agg_cooks_filtering(shared_states)`

Aggregate the local number of samples above to get cooks filtered genes.

Parameters:

Name	Type	Description	Default
`shared_states`	`list[dict]`	List of shared states from the local step with the following keys: - local_num_samples_above: np.ndarray of shape (n_genes,) The local number of samples above the max cooks gene counts. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test. - wald_statistics: np.ndarray of shape (n_genes,) The Wald statistics from the Wald test. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors from the Wald test.	required

Returns:

Type	Description
`dict`	A shared state with the following fields: - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test with nan for the cooks outliers. - wald_statistics: np.ndarray of shape (n_genes,) The Wald statistics. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote
@log_remote
def agg_cooks_filtering(self, shared_states: list[dict]) -> dict:
    """Aggregate the local number of samples above to get cooks filtered genes.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_num_samples_above: np.ndarray of shape (n_genes,)
            The local number of samples above the max cooks gene counts.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test.
        - wald_statistics: np.ndarray of shape (n_genes,)
            The Wald statistics from the Wald test.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors from the Wald test.

    Returns
    -------
    dict
        A shared state with the following fields:
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test with nan for the cooks outliers.
        - wald_statistics: np.ndarray of shape (n_genes,)
            The Wald statistics.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors.
    """
    # Find the number of samples with counts above the max cooks
    cooks_outliers = shared_states[0]["cooks_outliers"]

    num_samples_above_max_cooks = np.sum(
        [state["local_num_samples_above"] for state in shared_states], axis=0
    )

    # If that number is greater than 3, set the cooks filter to false
    cooks_outliers[cooks_outliers] = num_samples_above_max_cooks < 3

    # Set the p-values to nan on cooks outliers
    p_values = shared_states[0]["p_values"]
    p_values[cooks_outliers] = np.nan

    return {
        "p_values": p_values,
        "wald_statistics": shared_states[0]["wald_statistics"],
        "wald_se": shared_states[0]["wald_se"],
    }

`AggMaxCooks`

Mixin class to aggregate the max cooks distances.

Methods:

Name	Description
`agg_max_cooks`	Aggregate the local max cooks distances.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class AggMaxCooks:
    """Mixin class to aggregate the max cooks distances.

    Methods
    -------
    agg_max_cooks
        Aggregate the local max cooks distances.
    """

    @remote
    @log_remote
    def agg_max_cooks(self, shared_states: list[dict]) -> dict:
        """Aggregate the local max cooks.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_max_cooks: np.ndarray of shape (n_genes,)
                The local maximum cooks distance for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            Aggregated max cooks.
            It is a dictionary with the following fields:
            - max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers in the aggregated dataset.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
        """
        return {
            "max_cooks": np.max(
                [state["local_max_cooks"] for state in shared_states], axis=0
            ),
            "cooks_outliers": shared_states[0]["cooks_outliers"],
        }

`agg_max_cooks(shared_states)`

Aggregate the local max cooks.

Parameters:

Name	Type	Description	Default
`shared_states`	`list[dict]`	List of shared states from the local step with the following keys: - local_max_cooks: np.ndarray of shape (n_genes,) The local maximum cooks distance for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.	required

Returns:

Name	Type	Description
`shared_state`	`dict`	Aggregated max cooks. It is a dictionary with the following fields: - max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers in the aggregated dataset. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote
@log_remote
def agg_max_cooks(self, shared_states: list[dict]) -> dict:
    """Aggregate the local max cooks.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_max_cooks: np.ndarray of shape (n_genes,)
            The local maximum cooks distance for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        Aggregated max cooks.
        It is a dictionary with the following fields:
        - max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers in the aggregated dataset.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
    """
    return {
        "max_cooks": np.max(
            [state["local_max_cooks"] for state in shared_states], axis=0
        ),
        "cooks_outliers": shared_states[0]["cooks_outliers"],
    }

`AggMaxCooksCounts`

Mixin class to aggregate the max cooks gene counts.

Methods:

Name	Description
`agg_max_cooks_gene_counts`	Aggregate the local max cooks gene counts. The goal is to have the gene counts corresponding to the maximum cooks distance for each gene across all datasets.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class AggMaxCooksCounts:
    """Mixin class to aggregate the max cooks gene counts.

    Methods
    -------
    agg_max_cooks_gene_counts
        Aggregate the local max cooks gene counts. The goal is to have the gene
        counts corresponding to the maximum cooks distance for each gene across
        all datasets.
    """

    @remote
    @log_remote
    def agg_max_cooks_gene_counts(self, shared_states: list[dict]) -> dict:
        """Aggregate the local max cooks gene counts.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,)
                The local maximum cooks gene counts for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene across all datasets.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
        """
        return {
            "max_cooks_gene_counts": np.ma.stack(
                [state["local_max_cooks_gene_counts"] for state in shared_states],
                axis=0,
            ).min(axis=0),
            "cooks_outliers": shared_states[0]["cooks_outliers"],
        }

`agg_max_cooks_gene_counts(shared_states)`

Aggregate the local max cooks gene counts.

Parameters:

Name	Type	Description	Default
`shared_states`	`list[dict]`	List of shared states from the local step with the following keys: - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,) The local maximum cooks gene counts for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.	required

Returns:

Name	Type	Description
`shared_state`	`dict`	A shared state with the following fields: - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene across all datasets. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote
@log_remote
def agg_max_cooks_gene_counts(self, shared_states: list[dict]) -> dict:
    """Aggregate the local max cooks gene counts.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,)
            The local maximum cooks gene counts for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene across all datasets.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
    """
    return {
        "max_cooks_gene_counts": np.ma.stack(
            [state["local_max_cooks_gene_counts"] for state in shared_states],
            axis=0,
        ).min(axis=0),
        "cooks_outliers": shared_states[0]["cooks_outliers"],
    }

`AggregateCooksOutliers`

Mixin class to aggregate the cooks outliers.

Methods:

Name	Description
`agg_cooks_outliers`	Aggregate the local cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class AggregateCooksOutliers:
    """Mixin class to aggregate the cooks outliers.

    Methods
    -------
    agg_cooks_outliers
        Aggregate the local cooks outliers.
    """

    @remote
    @log_remote
    @prepare_cooks_agg
    def agg_cooks_outliers(self, shared_states: list[dict]) -> dict:
        """Aggregate the local cooks outliers.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_cooks_outliers: np.ndarray of shape (n_genes,)
            - cooks_cutoff: float

        Returns
        -------
        shared_state : dict
            Aggregated cooks outliers.
            It is a dictionary with the following fields:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier in
                any of the local datasets
            - cooks_cutoff: float
                The cutoff used to define the fact that a gene is a cooks outlier.
        """
        return {
            "cooks_outliers": np.any(
                [state["local_cooks_outliers"] for state in shared_states], axis=0
            ),
            "cooks_cutoff": shared_states[0]["cooks_cutoff"],
        }

`agg_cooks_outliers(shared_states)`

Aggregate the local cooks outliers.

Parameters:

Name	Type	Description	Default
`shared_states`	`list[dict]`	List of shared states from the local step with the following keys: - local_cooks_outliers: np.ndarray of shape (n_genes,) - cooks_cutoff: float	required

Returns:

Name	Type	Description
`shared_state`	`dict`	Aggregated cooks outliers. It is a dictionary with the following fields: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier in any of the local datasets - cooks_cutoff: float The cutoff used to define the fact that a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote
@log_remote
@prepare_cooks_agg
def agg_cooks_outliers(self, shared_states: list[dict]) -> dict:
    """Aggregate the local cooks outliers.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_cooks_outliers: np.ndarray of shape (n_genes,)
        - cooks_cutoff: float

    Returns
    -------
    shared_state : dict
        Aggregated cooks outliers.
        It is a dictionary with the following fields:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier in
            any of the local datasets
        - cooks_cutoff: float
            The cutoff used to define the fact that a gene is a cooks outlier.
    """
    return {
        "cooks_outliers": np.any(
            [state["local_cooks_outliers"] for state in shared_states], axis=0
        ),
        "cooks_cutoff": shared_states[0]["cooks_cutoff"],
    }

`LocCountNumberSamplesAbove`

Mixin class to count the number of samples above the max cooks gene counts.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`

Methods:

Name	Description
`count_local_number_samples_above`	Count the number of samples above the max cooks gene counts.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class LocCountNumberSamplesAbove:
    """Mixin class to count the number of samples above the max cooks gene counts.

    Attributes
    ----------
    local_adata : AnnData

    Methods
    -------
    count_local_number_samples_above
        Count the number of samples above the max cooks gene counts.
    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def count_local_number_samples_above(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Count the number of samples above the max cooks gene counts.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - max_cooks_gene_counts: np.ndarray of shape (n_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene across all datasets.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_num_samples_above: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the number of samples above the
                maximum cooks gene counts.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test.
            - wald_statistic: np.ndarray of shape (n_genes,)
                The Wald statistics from the Wald test.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors from the Wald test.
        """
        cooks_outliers = shared_state["cooks_outliers"]
        max_cooks_gene_counts = shared_state["max_cooks_gene_counts"]

        num_samples_above = np.sum(
            self.local_adata.X[:, cooks_outliers] > max_cooks_gene_counts, axis=0
        )

        return {
            "local_num_samples_above": num_samples_above,
            "cooks_outliers": cooks_outliers,
            "p_values": self.local_adata.varm["p_values"],
            "wald_statistics": self.local_adata.varm["wald_statistics"],
            "wald_se": self.local_adata.varm["wald_se"],
        }

`count_local_number_samples_above(data_from_opener, shared_state)`

Count the number of samples above the max cooks gene counts.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state from the previous step with the following keys: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - max_cooks_gene_counts: np.ndarray of shape (n_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene across all datasets.	required

Returns:

Name	Type	Description
`shared_state`	`dict`	A shared state with the following fields: - local_num_samples_above: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the number of samples above the maximum cooks gene counts. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test. - wald_statistic: np.ndarray of shape (n_genes,) The Wald statistics from the Wald test. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors from the Wald test.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
def count_local_number_samples_above(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Count the number of samples above the max cooks gene counts.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - max_cooks_gene_counts: np.ndarray of shape (n_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene across all datasets.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_num_samples_above: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the number of samples above the
            maximum cooks gene counts.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test.
        - wald_statistic: np.ndarray of shape (n_genes,)
            The Wald statistics from the Wald test.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors from the Wald test.
    """
    cooks_outliers = shared_state["cooks_outliers"]
    max_cooks_gene_counts = shared_state["max_cooks_gene_counts"]

    num_samples_above = np.sum(
        self.local_adata.X[:, cooks_outliers] > max_cooks_gene_counts, axis=0
    )

    return {
        "local_num_samples_above": num_samples_above,
        "cooks_outliers": cooks_outliers,
        "p_values": self.local_adata.varm["p_values"],
        "wald_statistics": self.local_adata.varm["wald_statistics"],
        "wald_se": self.local_adata.varm["wald_se"],
    }

`LocFindCooksOutliers`

Mixin class to find the local cooks outliers.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`	Local AnnData object. Is expected to have a "tot_num_samples" key in uns.
`refit_cooks`	`bool`	Whether to refit the cooks outliers.

Methods:

Name	Description
`find_local_cooks_outliers`	Find the local cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class LocFindCooksOutliers:
    """Mixin class to find the local cooks outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.
        Is expected to have a "tot_num_samples" key in uns.

    refit_cooks : bool
        Whether to refit the cooks outliers.


    Methods
    -------
    find_local_cooks_outliers
        Find the local cooks outliers.
    """

    local_adata: AnnData
    refit_cooks: bool

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    @prepare_cooks_local
    def find_local_cooks_outliers(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Find the local cooks outliers.

        This method is expected to run on the results of the Wald tests.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - p_values: np.ndarray of shape (n_genes,)
            - wald_statistics: np.ndarray of shape (n_genes,)
            - wald_se: np.ndarray of shape (n_genes,)

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - cooks_cutoff: float
                The cutoff used to define the fact that a gene is a cooks outlier.
        """
        # Save these in the local adata
        self.local_adata.varm["p_values"] = shared_state["p_values"]
        self.local_adata.varm["wald_statistics"] = shared_state["wald_statistics"]
        self.local_adata.varm["wald_se"] = shared_state["wald_se"]

        tot_num_samples = self.local_adata.uns["tot_num_samples"]
        num_vars = self.local_adata.uns["n_params"]

        cooks_cutoff = f.ppf(0.99, num_vars, tot_num_samples - num_vars)

        # Take into account whether we already replaced outliers
        cooks_layer = (
            "replace_cooks"
            if self.refit_cooks and self.local_adata.varm["refitted"].sum() > 0
            else "cooks"
        )

        if cooks_layer == "replace_cooks":
            assert "replaced" in self.local_adata.varm.keys()
            replace_cooks = pd.DataFrame(self.local_adata.layers["cooks"].copy())
            replace_cooks.loc[
                self.local_adata.obsm["replaceable"], self.local_adata.varm["refitted"]
            ] = 0.0
            self.local_adata.layers["replace_cooks"] = replace_cooks

        use_for_max = self.local_adata.obs["cells"].apply(
            lambda x: (self.local_adata.uns["num_replicates"] >= 3).loc[x]
        )

        cooks_outliers = (
            (self.local_adata[use_for_max, :].layers[cooks_layer] > cooks_cutoff)
            .any(axis=0)
            .copy()
        )

        return {"local_cooks_outliers": cooks_outliers, "cooks_cutoff": cooks_cutoff}

`find_local_cooks_outliers(data_from_opener, shared_state)`

Find the local cooks outliers.

This method is expected to run on the results of the Wald tests.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state from the previous step with the following keys: - p_values: np.ndarray of shape (n_genes,) - wald_statistics: np.ndarray of shape (n_genes,) - wald_se: np.ndarray of shape (n_genes,)	required

Returns:

Name	Type	Description
`shared_state`	`dict`	A shared state with the following fields: - local_cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - cooks_cutoff: float The cutoff used to define the fact that a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
@prepare_cooks_local
def find_local_cooks_outliers(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Find the local cooks outliers.

    This method is expected to run on the results of the Wald tests.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - p_values: np.ndarray of shape (n_genes,)
        - wald_statistics: np.ndarray of shape (n_genes,)
        - wald_se: np.ndarray of shape (n_genes,)

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - cooks_cutoff: float
            The cutoff used to define the fact that a gene is a cooks outlier.
    """
    # Save these in the local adata
    self.local_adata.varm["p_values"] = shared_state["p_values"]
    self.local_adata.varm["wald_statistics"] = shared_state["wald_statistics"]
    self.local_adata.varm["wald_se"] = shared_state["wald_se"]

    tot_num_samples = self.local_adata.uns["tot_num_samples"]
    num_vars = self.local_adata.uns["n_params"]

    cooks_cutoff = f.ppf(0.99, num_vars, tot_num_samples - num_vars)

    # Take into account whether we already replaced outliers
    cooks_layer = (
        "replace_cooks"
        if self.refit_cooks and self.local_adata.varm["refitted"].sum() > 0
        else "cooks"
    )

    if cooks_layer == "replace_cooks":
        assert "replaced" in self.local_adata.varm.keys()
        replace_cooks = pd.DataFrame(self.local_adata.layers["cooks"].copy())
        replace_cooks.loc[
            self.local_adata.obsm["replaceable"], self.local_adata.varm["refitted"]
        ] = 0.0
        self.local_adata.layers["replace_cooks"] = replace_cooks

    use_for_max = self.local_adata.obs["cells"].apply(
        lambda x: (self.local_adata.uns["num_replicates"] >= 3).loc[x]
    )

    cooks_outliers = (
        (self.local_adata[use_for_max, :].layers[cooks_layer] > cooks_cutoff)
        .any(axis=0)
        .copy()
    )

    return {"local_cooks_outliers": cooks_outliers, "cooks_cutoff": cooks_cutoff}

`LocGetMaxCooks`

Mixin class to get the maximum cooks distance for the outliers.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`	Local AnnData object.

Methods:

Name	Description
`get_max_local_cooks`	Get the maximum cooks distance for the outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class LocGetMaxCooks:
    """Mixin class to get the maximum cooks distance for the outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    get_max_local_cooks
        Get the maximum cooks distance for the outliers.
    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def get_max_local_cooks(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Get the maximum cooks distance for the outliers.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - cooks_cutoff: float

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers in the local dataset.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
        """
        cooks_outliers = shared_state["cooks_outliers"]
        cooks_cutoff = shared_state["cooks_cutoff"]

        max_cooks = np.max(self.local_adata.layers["cooks"][:, cooks_outliers], axis=0)

        max_cooks[max_cooks <= cooks_cutoff] = 0.0

        max_cooks_idx = self.local_adata.layers["cooks"][:, cooks_outliers].argmax(
            axis=0
        )

        max_cooks_value = self.local_adata.layers["cooks"][:, cooks_outliers][
            max_cooks_idx, np.arange(len(max_cooks))
        ]

        max_cooks_gene_counts = self.local_adata.X[:, cooks_outliers][
            max_cooks_idx, np.arange(len(max_cooks))
        ]

        # Save the max cooks gene counts and max cooks value
        self.local_adata.uns["max_cooks_gene_counts"] = max_cooks_gene_counts
        self.local_adata.uns["max_cooks_value"] = max_cooks_value

        return {
            "local_max_cooks": max_cooks,
            "cooks_outliers": cooks_outliers,
        }

`get_max_local_cooks(data_from_opener, shared_state)`

Get the maximum cooks distance for the outliers.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state from the previous step with the following keys: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - cooks_cutoff: float	required

Returns:

Name	Type	Description
`shared_state`	`dict`	A shared state with the following fields: - local_max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers in the local dataset. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
def get_max_local_cooks(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Get the maximum cooks distance for the outliers.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - cooks_cutoff: float

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers in the local dataset.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
    """
    cooks_outliers = shared_state["cooks_outliers"]
    cooks_cutoff = shared_state["cooks_cutoff"]

    max_cooks = np.max(self.local_adata.layers["cooks"][:, cooks_outliers], axis=0)

    max_cooks[max_cooks <= cooks_cutoff] = 0.0

    max_cooks_idx = self.local_adata.layers["cooks"][:, cooks_outliers].argmax(
        axis=0
    )

    max_cooks_value = self.local_adata.layers["cooks"][:, cooks_outliers][
        max_cooks_idx, np.arange(len(max_cooks))
    ]

    max_cooks_gene_counts = self.local_adata.X[:, cooks_outliers][
        max_cooks_idx, np.arange(len(max_cooks))
    ]

    # Save the max cooks gene counts and max cooks value
    self.local_adata.uns["max_cooks_gene_counts"] = max_cooks_gene_counts
    self.local_adata.uns["max_cooks_value"] = max_cooks_value

    return {
        "local_max_cooks": max_cooks,
        "cooks_outliers": cooks_outliers,
    }

`LocGetMaxCooksCounts`

Mixin class to get the maximum cooks counts for the outliers.

Attributes:

Name	Type	Description
`local_adata`	`AnnData`	Local AnnData object.

Methods:

Name	Description
`get_max_local_cooks_gene_counts`	Get the maximum cooks counts for the outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

class LocGetMaxCooksCounts:
    """Mixin class to get the maximum cooks counts for the outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    get_max_local_cooks_gene_counts
        Get the maximum cooks counts for the outliers.
    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def get_max_local_cooks_gene_counts(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Get the maximum cooks counts for the outliers.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene if the maximum cooks distance
                in the local dataset is equal to the maximum cooks distance in the
                aggregated dataset, and nan otherwise.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
        """
        max_cooks = shared_state["max_cooks"]
        cooks_outliers = shared_state["cooks_outliers"]

        max_cooks_gene_counts = self.local_adata.uns["max_cooks_gene_counts"].copy()
        max_cooks_value = self.local_adata.uns["max_cooks_value"].copy()

        # Remove them from the uns field as they are no longer needed
        del self.local_adata.uns["max_cooks_gene_counts"]
        del self.local_adata.uns["max_cooks_value"]

        max_cooks_gene_counts[
            max_cooks_value < max_cooks
        ] = -1  # We can use a < because the count value are non negative integers.

        max_cooks_gene_counts_ma = np.ma.masked_array(
            max_cooks_gene_counts, max_cooks_gene_counts == -1
        )

        return {
            "local_max_cooks_gene_counts": max_cooks_gene_counts_ma,
            "cooks_outliers": cooks_outliers,
        }

`get_max_local_cooks_gene_counts(data_from_opener, shared_state)`

Get the maximum cooks counts for the outliers.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	Not used.	required
`shared_state`	`dict`	Shared state from the previous step with the following keys: - max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.	required

Returns:

Name	Type	Description
`shared_state`	`dict`	A shared state with the following fields: - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene if the maximum cooks distance in the local dataset is equal to the maximum cooks distance in the aggregated dataset, and nan otherwise. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
def get_max_local_cooks_gene_counts(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Get the maximum cooks counts for the outliers.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene if the maximum cooks distance
            in the local dataset is equal to the maximum cooks distance in the
            aggregated dataset, and nan otherwise.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
    """
    max_cooks = shared_state["max_cooks"]
    cooks_outliers = shared_state["cooks_outliers"]

    max_cooks_gene_counts = self.local_adata.uns["max_cooks_gene_counts"].copy()
    max_cooks_value = self.local_adata.uns["max_cooks_value"].copy()

    # Remove them from the uns field as they are no longer needed
    del self.local_adata.uns["max_cooks_gene_counts"]
    del self.local_adata.uns["max_cooks_value"]

    max_cooks_gene_counts[
        max_cooks_value < max_cooks
    ] = -1  # We can use a < because the count value are non negative integers.

    max_cooks_gene_counts_ma = np.ma.masked_array(
        max_cooks_gene_counts, max_cooks_gene_counts == -1
    )

    return {
        "local_max_cooks_gene_counts": max_cooks_gene_counts_ma,
        "cooks_outliers": cooks_outliers,
    }

`deseq2_stats`

`DESeq2Stats`

Bases: RunWaldTests, CooksFiltering, ComputeAdjustedPValues

Mixin class to compute statistics with DESeq2.

This class encapsulates the Wald tests, the Cooks filtering and the computation of adjusted p-values.

Methods:

Name	Description
`run_deseq2_stats`	Run the DESeq2 statistics pipeline. Performs Wald tests, Cook's filtering and computes adjusted p-values.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/deseq2_stats.py

class DESeq2Stats(RunWaldTests, CooksFiltering, ComputeAdjustedPValues):
    """Mixin class to compute statistics with DESeq2.

    This class encapsulates the Wald tests, the Cooks filtering and the computation
    of adjusted p-values.

    Methods
    -------
    run_deseq2_stats
        Run the DESeq2 statistics pipeline.
        Performs Wald tests, Cook's filtering and computes adjusted p-values.
    """

    cooks_filter: bool

    @log_organisation_method
    def run_deseq2_stats(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models,
    ):
        """Run the DESeq2 statistics pipeline.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: list[dict]
            Local states. Required to propagate intermediate results.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.


        Returns
        -------
        local_states: dict
            Local states.

        round_idx: int
            The updated round index.
        """
        #### Perform Wald tests ####
        logger.info("Running Wald tests.")

        local_states, wald_shared_state, round_idx = self.run_wald_tests(
            train_data_nodes,
            aggregation_node,
            local_states,
            round_idx,
            clean_models=clean_models,
        )

        logger.info("Finished running Wald tests.")

        if self.cooks_filter:
            logger.info("Running Cook's filtering...")
            local_states, wald_shared_state, round_idx = self.cooks_filtering(
                train_data_nodes,
                aggregation_node,
                local_states,
                wald_shared_state,
                round_idx,
                clean_models=clean_models,
            )
            logger.info("Finished running Cook's filtering.")
            logger.info("Computing adjusted p-values...")
        (
            local_states,
            round_idx,
        ) = self.compute_adjusted_p_values(
            train_data_nodes,
            aggregation_node,
            local_states,
            wald_shared_state,
            round_idx,
            clean_models=clean_models,
        )
        logger.info("Finished computing adjusted p-values.")

        return local_states, round_idx

`run_deseq2_stats(train_data_nodes, aggregation_node, local_states, round_idx, clean_models)`

Run the DESeq2 statistics pipeline.

Parameters:

Name	Description	Default
`train_data_nodes`	List of TrainDataNode.	required
`aggregation_node`	The aggregation node.	required
`local_states`	Local states. Required to propagate intermediate results.	required
`round_idx`	Index of the current round.	required
`clean_models`	Whether to clean the models after the computation.	required

Returns:

Name	Type	Description
`local_states`	`dict`	Local states.
`round_idx`	`int`	The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/deseq2_stats.py

@log_organisation_method
def run_deseq2_stats(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    round_idx,
    clean_models,
):
    """Run the DESeq2 statistics pipeline.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: list[dict]
        Local states. Required to propagate intermediate results.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.


    Returns
    -------
    local_states: dict
        Local states.

    round_idx: int
        The updated round index.
    """
    #### Perform Wald tests ####
    logger.info("Running Wald tests.")

    local_states, wald_shared_state, round_idx = self.run_wald_tests(
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models=clean_models,
    )

    logger.info("Finished running Wald tests.")

    if self.cooks_filter:
        logger.info("Running Cook's filtering...")
        local_states, wald_shared_state, round_idx = self.cooks_filtering(
            train_data_nodes,
            aggregation_node,
            local_states,
            wald_shared_state,
            round_idx,
            clean_models=clean_models,
        )
        logger.info("Finished running Cook's filtering.")
        logger.info("Computing adjusted p-values...")
    (
        local_states,
        round_idx,
    ) = self.compute_adjusted_p_values(
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_shared_state,
        round_idx,
        clean_models=clean_models,
    )
    logger.info("Finished computing adjusted p-values.")

    return local_states, round_idx

`wald_tests`

`substeps`

`AggRunWaldTests`

Mixin to run Wald tests.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py

class AggRunWaldTests:
    """Mixin to run Wald tests."""

    lfc_null: float
    alt_hypothesis: Literal["greaterAbs", "lessAbs", "greater", "less"] | None
    num_jobs: int
    joblib_verbosity: int
    joblib_backend: str

    @remote
    @log_remote
    @prepare_cooks_agg
    def agg_run_wald_tests(self, shared_states: list) -> dict:
        """Run the Wald tests.

        Parameters
        ----------
        shared_states : list
            List of shared states containing:
            - local_H_matrix: np.ndarray
                The local H matrix.
            - LFC: np.ndarray
                The log fold changes, in natural log scale.
            - contrast_vector: np.ndarray
                The contrast vector.

        Returns
        -------
        dict
            Contains:
            - p_values: np.ndarray
                The (unadjusted) p-values (n_genes,).
            - wald_statistics: np.ndarray
                The Wald statistics (n_genes,).
            - wald_se: np.ndarray
                The standard errors of the Wald statistics (n_genes,).
        """
        # First step: aggregate the local H matrices

        H = sum([state["local_H_matrix"] for state in shared_states])

        # Second step: compute the Wald tests in parallel
        with parallel_backend(self.joblib_backend):
            wald_test_results = Parallel(
                n_jobs=self.num_jobs, verbose=self.joblib_verbosity
            )(
                delayed(wald_test)(
                    H[i],
                    shared_states[0]["LFC"].values[i],
                    None,
                    shared_states[0]["contrast_vector"],
                    np.log(2) * self.lfc_null,
                    self.alt_hypothesis,
                )
                for i in range(len(H))
            )

        # Finally, unpack the results
        p_values = np.array([r[0] for r in wald_test_results])
        wald_statistics = np.array([r[1] for r in wald_test_results])
        wald_se = np.array([r[2] for r in wald_test_results])

        return {
            "p_values": p_values,
            "wald_statistics": wald_statistics,
            "wald_se": wald_se,
        }

`agg_run_wald_tests(shared_states)`

Run the Wald tests.

Parameters:

Name	Type	Description	Default
`shared_states`	`list`	List of shared states containing: - local_H_matrix: np.ndarray The local H matrix. - LFC: np.ndarray The log fold changes, in natural log scale. - contrast_vector: np.ndarray The contrast vector.	required

Returns:

Type	Description
`dict`	Contains: - p_values: np.ndarray The (unadjusted) p-values (n_genes,). - wald_statistics: np.ndarray The Wald statistics (n_genes,). - wald_se: np.ndarray The standard errors of the Wald statistics (n_genes,).

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py

@remote
@log_remote
@prepare_cooks_agg
def agg_run_wald_tests(self, shared_states: list) -> dict:
    """Run the Wald tests.

    Parameters
    ----------
    shared_states : list
        List of shared states containing:
        - local_H_matrix: np.ndarray
            The local H matrix.
        - LFC: np.ndarray
            The log fold changes, in natural log scale.
        - contrast_vector: np.ndarray
            The contrast vector.

    Returns
    -------
    dict
        Contains:
        - p_values: np.ndarray
            The (unadjusted) p-values (n_genes,).
        - wald_statistics: np.ndarray
            The Wald statistics (n_genes,).
        - wald_se: np.ndarray
            The standard errors of the Wald statistics (n_genes,).
    """
    # First step: aggregate the local H matrices

    H = sum([state["local_H_matrix"] for state in shared_states])

    # Second step: compute the Wald tests in parallel
    with parallel_backend(self.joblib_backend):
        wald_test_results = Parallel(
            n_jobs=self.num_jobs, verbose=self.joblib_verbosity
        )(
            delayed(wald_test)(
                H[i],
                shared_states[0]["LFC"].values[i],
                None,
                shared_states[0]["contrast_vector"],
                np.log(2) * self.lfc_null,
                self.alt_hypothesis,
            )
            for i in range(len(H))
        )

    # Finally, unpack the results
    p_values = np.array([r[0] for r in wald_test_results])
    wald_statistics = np.array([r[1] for r in wald_test_results])
    wald_se = np.array([r[2] for r in wald_test_results])

    return {
        "p_values": p_values,
        "wald_statistics": wald_statistics,
        "wald_se": wald_se,
    }

`LocBuildContrastVectorHMatrix`

Mixin to get compute contrast vectors and local H matrices.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py

class LocBuildContrastVectorHMatrix:
    """Mixin to get compute contrast vectors and local H matrices."""

    local_adata: ad.AnnData
    num_jobs: int
    joblib_verbosity: int
    joblib_backend: str
    irls_batch_size: int

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    @prepare_cooks_local
    def compute_contrast_vector_and_H_matrix(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Build the contrast vector and the local H matrices.

        Parameters
        ----------
        data_from_opener : ad.AnnData
            AnnData returned by the opener. Not used.

        shared_state : dict
            Not used.

        Returns
        -------
        dict
            Contains:
            - local_H_matrix: np.ndarray
                The local H matrix.
            - LFC: np.ndarray
                The log fold changes, in natural log scale.
            - contrast_vector: np.ndarray
                The contrast vector.
        """
        # Build contrast vector and index
        (
            self.local_adata.uns["contrast_vector"],
            self.local_adata.uns["contrast_idx"],
        ) = build_contrast_vector(
            self.local_adata.uns["contrast"],
            self.local_adata.varm["LFC"].columns,
        )

        # ---- Compute the summands for the covariance matrix ---- #

        with parallel_backend(self.joblib_backend):
            res = Parallel(n_jobs=self.num_jobs, verbose=self.joblib_verbosity)(
                delayed(make_irls_update_summands_and_nll_batch)(
                    self.local_adata.obsm["design_matrix"].values,
                    self.local_adata.obsm["size_factors"],
                    self.local_adata.varm["LFC"][i : i + self.irls_batch_size].values,
                    self.local_adata.varm["dispersions"][i : i + self.irls_batch_size],
                    self.local_adata.X[:, i : i + self.irls_batch_size],
                    0,
                )
                for i in range(0, self.local_adata.n_vars, self.irls_batch_size)
            )

        H = np.concatenate([r[0] for r in res])

        return {
            "local_H_matrix": H,
            "LFC": self.local_adata.varm["LFC"],
            "contrast_vector": self.local_adata.uns["contrast_vector"],
        }

`compute_contrast_vector_and_H_matrix(data_from_opener, shared_state)`

Build the contrast vector and the local H matrices.

Parameters:

Name	Type	Description	Default
`data_from_opener`	`AnnData`	AnnData returned by the opener. Not used.	required
`shared_state`	`dict`	Not used.	required

Returns:

Type	Description
`dict`	Contains: - local_H_matrix: np.ndarray The local H matrix. - LFC: np.ndarray The log fold changes, in natural log scale. - contrast_vector: np.ndarray The contrast vector.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py

@remote_data
@log_remote_data
@reconstruct_adatas
@prepare_cooks_local
def compute_contrast_vector_and_H_matrix(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Build the contrast vector and the local H matrices.

    Parameters
    ----------
    data_from_opener : ad.AnnData
        AnnData returned by the opener. Not used.

    shared_state : dict
        Not used.

    Returns
    -------
    dict
        Contains:
        - local_H_matrix: np.ndarray
            The local H matrix.
        - LFC: np.ndarray
            The log fold changes, in natural log scale.
        - contrast_vector: np.ndarray
            The contrast vector.
    """
    # Build contrast vector and index
    (
        self.local_adata.uns["contrast_vector"],
        self.local_adata.uns["contrast_idx"],
    ) = build_contrast_vector(
        self.local_adata.uns["contrast"],
        self.local_adata.varm["LFC"].columns,
    )

    # ---- Compute the summands for the covariance matrix ---- #

    with parallel_backend(self.joblib_backend):
        res = Parallel(n_jobs=self.num_jobs, verbose=self.joblib_verbosity)(
            delayed(make_irls_update_summands_and_nll_batch)(
                self.local_adata.obsm["design_matrix"].values,
                self.local_adata.obsm["size_factors"],
                self.local_adata.varm["LFC"][i : i + self.irls_batch_size].values,
                self.local_adata.varm["dispersions"][i : i + self.irls_batch_size],
                self.local_adata.X[:, i : i + self.irls_batch_size],
                0,
            )
            for i in range(0, self.local_adata.n_vars, self.irls_batch_size)
        )

    H = np.concatenate([r[0] for r in res])

    return {
        "local_H_matrix": H,
        "LFC": self.local_adata.varm["LFC"],
        "contrast_vector": self.local_adata.uns["contrast_vector"],
    }

`wald_tests`

`RunWaldTests`

Bases: LocBuildContrastVectorHMatrix, AggRunWaldTests

Mixin class to implement the computation of the Wald tests.

Methods:

Name	Description
`run_wald_tests`	The method to compute the Wald tests.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/wald_tests.py

class RunWaldTests(LocBuildContrastVectorHMatrix, AggRunWaldTests):
    """Mixin class to implement the computation of the Wald tests.

    Methods
    -------
    run_wald_tests
        The method to compute the Wald tests.
    """

    @log_organisation_method
    def run_wald_tests(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models,
    ):
        """Compute the Wald tests.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: dict
            Local states. Required to propagate intermediate results.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.
        """
        # --- Build contrast vectors and compute local H matrices --- #
        local_states, shared_states, round_idx = local_step(
            local_method=self.compute_contrast_vector_and_H_matrix,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=None,  # TODO plug in previous step
            aggregation_id=aggregation_node.organization_id,
            description="Build contrast vectors and compute local H matrices",
            clean_models=clean_models,
        )

        # --- Aggregate the H matrices and run the Wald tests --- #
        wald_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_run_wald_tests,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            round_idx=round_idx,
            description="Run Wald tests.",
            clean_models=clean_models,
        )

        return local_states, wald_shared_state, round_idx

`run_wald_tests(train_data_nodes, aggregation_node, local_states, round_idx, clean_models)`

Compute the Wald tests.

Parameters:

Name	Description	Default
`train_data_nodes`	List of TrainDataNode.	required
`aggregation_node`	The aggregation node.	required
`local_states`	Local states. Required to propagate intermediate results.	required
`round_idx`	Index of the current round.	required
`clean_models`	Whether to clean the models after the computation.	required

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/wald_tests.py

@log_organisation_method
def run_wald_tests(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    round_idx,
    clean_models,
):
    """Compute the Wald tests.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: dict
        Local states. Required to propagate intermediate results.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.
    """
    # --- Build contrast vectors and compute local H matrices --- #
    local_states, shared_states, round_idx = local_step(
        local_method=self.compute_contrast_vector_and_H_matrix,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        round_idx=round_idx,
        input_local_states=local_states,
        input_shared_state=None,  # TODO plug in previous step
        aggregation_id=aggregation_node.organization_id,
        description="Build contrast vectors and compute local H matrices",
        clean_models=clean_models,
    )

    # --- Aggregate the H matrices and run the Wald tests --- #
    wald_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_run_wald_tests,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        round_idx=round_idx,
        description="Run Wald tests.",
        clean_models=clean_models,
    )

    return local_states, wald_shared_state, round_idx