Skip to content

Computing statistics and p-values

Module containing all the necessary steps to perform statistical analysis.

compute_padj

Module containing the Mixin to compute adjusted p-values.

compute_padj

ComputeAdjustedPValues

Bases: IndependentFiltering, PValueAdjustment

Mixin class to implement the computation of adjusted p-values.

Attributes:

Name Type Description
independent_filter bool

A boolean flag to indicate whether to use independent filtering or not.

Methods:

Name Description
compute_adjusted_p_values

A method to compute adjusted p-values. Runs independent filtering if self.independent_filter is True. Runs BH method otherwise.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/compute_padj.py
class ComputeAdjustedPValues(IndependentFiltering, PValueAdjustment):
    """Mixin class to implement the computation of adjusted p-values.

    Attributes
    ----------
    independent_filter: bool
        A boolean flag to indicate whether to use independent filtering or not.

    Methods
    -------
    compute_adjusted_p_values
        A method to compute adjusted p-values.
        Runs independent filtering if self.independent_filter is True.
        Runs BH method otherwise.

    """

    independent_filter: bool = False

    def compute_adjusted_p_values(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_test_shared_state,
        round_idx,
        clean_models,
    ):
        """Compute adjusted p-values.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: dict
            Local states. Required to propagate intermediate results.

        wald_test_shared_state: dict
            Shared states containing the Wald test results.

        round_idx: int
            The current round.

        clean_models: bool
            If True, the models are cleaned.

        Returns
        -------
        local_states: dict
            Local states. Required to propagate intermediate results.


        round_idx: int
            The updated round index.

        """
        if self.independent_filter:
            local_states, _, round_idx = local_step(
                local_method=self.run_independent_filtering,
                train_data_nodes=train_data_nodes,
                output_local_states=local_states,
                round_idx=round_idx,
                input_local_states=local_states,
                input_shared_state=wald_test_shared_state,
                aggregation_id=aggregation_node.organization_id,
                description="Compute adjusted P values using independent filtering.",
                clean_models=clean_models,
            )
        else:
            local_states, _, round_idx = local_step(
                local_method=self.run_p_value_adjustment,
                train_data_nodes=train_data_nodes,
                output_local_states=local_states,
                round_idx=round_idx,
                input_local_states=local_states,
                input_shared_state=wald_test_shared_state,
                aggregation_id=aggregation_node.organization_id,
                description="Compute adjusted P values using BH method.",
                clean_models=clean_models,
            )

        return local_states, round_idx
compute_adjusted_p_values(train_data_nodes, aggregation_node, local_states, wald_test_shared_state, round_idx, clean_models)

Compute adjusted p-values.

Parameters:

Name Type Description Default
train_data_nodes

List of TrainDataNode.

required
aggregation_node

The aggregation node.

required
local_states

Local states. Required to propagate intermediate results.

required
wald_test_shared_state

Shared states containing the Wald test results.

required
round_idx

The current round.

required
clean_models

If True, the models are cleaned.

required

Returns:

Name Type Description
local_states dict

Local states. Required to propagate intermediate results.

round_idx int

The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/compute_padj.py
def compute_adjusted_p_values(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    wald_test_shared_state,
    round_idx,
    clean_models,
):
    """Compute adjusted p-values.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: dict
        Local states. Required to propagate intermediate results.

    wald_test_shared_state: dict
        Shared states containing the Wald test results.

    round_idx: int
        The current round.

    clean_models: bool
        If True, the models are cleaned.

    Returns
    -------
    local_states: dict
        Local states. Required to propagate intermediate results.


    round_idx: int
        The updated round index.

    """
    if self.independent_filter:
        local_states, _, round_idx = local_step(
            local_method=self.run_independent_filtering,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Compute adjusted P values using independent filtering.",
            clean_models=clean_models,
        )
    else:
        local_states, _, round_idx = local_step(
            local_method=self.run_p_value_adjustment,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Compute adjusted P values using BH method.",
            clean_models=clean_models,
        )

    return local_states, round_idx

substeps

IndependentFiltering

Mixin class implementing independent filtering.

Attributes:

Name Type Description
local_adata AnnData

Local AnnData object.

alpha float

Significance level.

Methods:

Name Description
run_independent_filtering

Run independent filtering on the p-values trend

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py
class IndependentFiltering:
    """Mixin class implementing independent filtering.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    alpha : float
        Significance level.

    Methods
    -------
    run_independent_filtering
        Run independent filtering on the p-values trend
    """

    local_adata: AnnData
    alpha: float

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def run_independent_filtering(self, data_from_opener, shared_state: Any):
        """Run independent filtering on the p-values trend.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state containing the results of the wald tests, namely
            - "p_values" : p-values
            - "wald_statistics" : Wald statistics
            - "wald_se" : Wald standard errors

        """
        p_values = shared_state["p_values"]
        wald_statistics = shared_state["wald_statistics"]
        wald_se = shared_state["wald_se"]

        self.local_adata.varm["p_values"] = p_values
        self.local_adata.varm["wald_statistics"] = wald_statistics
        self.local_adata.varm["wald_se"] = wald_se

        base_mean = self.local_adata.varm["_normed_means"]

        lower_quantile = np.mean(base_mean == 0)

        if lower_quantile < 0.95:
            upper_quantile = 0.95
        else:
            upper_quantile = 1

        theta = np.linspace(lower_quantile, upper_quantile, 50)
        cutoffs = np.quantile(base_mean, theta)

        result = pd.DataFrame(
            np.nan, index=self.local_adata.var_names, columns=np.arange(len(theta))
        )

        for i, cutoff in enumerate(cutoffs):
            use = (base_mean >= cutoff) & (~np.isnan(p_values))
            U2 = p_values[use]
            if not len(U2) == 0:
                result.loc[use, i] = false_discovery_control(U2, method="bh")

        num_rej = (result < self.alpha).sum(0)
        lowess_res = lowess(theta, num_rej, frac=1 / 5)

        if num_rej.max() <= 10:
            j = 0
        else:
            residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
            thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))

            if np.any(num_rej > thresh):
                j = np.where(num_rej > thresh)[0][0]
            else:
                j = 0

        self.local_adata.varm["padj"] = result.loc[:, j]
run_independent_filtering(data_from_opener, shared_state)

Run independent filtering on the p-values trend.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state containing the results of the wald tests, namely - "p_values" : p-values - "wald_statistics" : Wald statistics - "wald_se" : Wald standard errors

required
Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
def run_independent_filtering(self, data_from_opener, shared_state: Any):
    """Run independent filtering on the p-values trend.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state containing the results of the wald tests, namely
        - "p_values" : p-values
        - "wald_statistics" : Wald statistics
        - "wald_se" : Wald standard errors

    """
    p_values = shared_state["p_values"]
    wald_statistics = shared_state["wald_statistics"]
    wald_se = shared_state["wald_se"]

    self.local_adata.varm["p_values"] = p_values
    self.local_adata.varm["wald_statistics"] = wald_statistics
    self.local_adata.varm["wald_se"] = wald_se

    base_mean = self.local_adata.varm["_normed_means"]

    lower_quantile = np.mean(base_mean == 0)

    if lower_quantile < 0.95:
        upper_quantile = 0.95
    else:
        upper_quantile = 1

    theta = np.linspace(lower_quantile, upper_quantile, 50)
    cutoffs = np.quantile(base_mean, theta)

    result = pd.DataFrame(
        np.nan, index=self.local_adata.var_names, columns=np.arange(len(theta))
    )

    for i, cutoff in enumerate(cutoffs):
        use = (base_mean >= cutoff) & (~np.isnan(p_values))
        U2 = p_values[use]
        if not len(U2) == 0:
            result.loc[use, i] = false_discovery_control(U2, method="bh")

    num_rej = (result < self.alpha).sum(0)
    lowess_res = lowess(theta, num_rej, frac=1 / 5)

    if num_rej.max() <= 10:
        j = 0
    else:
        residual = num_rej[num_rej > 0] - lowess_res[num_rej > 0]
        thresh = lowess_res.max() - np.sqrt(np.mean(residual**2))

        if np.any(num_rej > thresh):
            j = np.where(num_rej > thresh)[0][0]
        else:
            j = 0

    self.local_adata.varm["padj"] = result.loc[:, j]

PValueAdjustment

Mixin class implementing p-value adjustment.

Attributes:

Name Type Description
local_adata AnnData

Local AnnData object.

Methods:

Name Description
run_p_value_adjustment

Run p-value adjustment on the p-values trend using the Benjamini-Hochberg method.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py
class PValueAdjustment:
    """Mixin class implementing p-value adjustment.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    run_p_value_adjustment
        Run p-value adjustment on the p-values trend using the Benjamini-Hochberg
        method.

    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def run_p_value_adjustment(self, data_from_opener, shared_state: Any):
        """Run p-value adjustment on the p-values trend using the BH method.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state containing the results of the Wald tests, namely
            - "p_values" : p-values, as a numpy array
            - "wald_statistics" : Wald statistics
            - "wald_se" : Wald standard errors

        """
        p_values = shared_state["p_values"]
        wald_statistics = shared_state["wald_statistics"]
        wald_se = shared_state["wald_se"]

        self.local_adata.varm["p_values"] = p_values
        self.local_adata.varm["wald_statistics"] = wald_statistics
        self.local_adata.varm["wald_se"] = wald_se

        padj = pd.Series(np.nan, index=self.local_adata.var_names)
        padj.loc[~np.isnan(p_values)] = false_discovery_control(
            p_values[~np.isnan(p_values)], method="bh"
        )

        self.local_adata.varm["padj"] = padj
run_p_value_adjustment(data_from_opener, shared_state)

Run p-value adjustment on the p-values trend using the BH method.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state containing the results of the Wald tests, namely - "p_values" : p-values, as a numpy array - "wald_statistics" : Wald statistics - "wald_se" : Wald standard errors

required
Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/compute_padj/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
def run_p_value_adjustment(self, data_from_opener, shared_state: Any):
    """Run p-value adjustment on the p-values trend using the BH method.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state containing the results of the Wald tests, namely
        - "p_values" : p-values, as a numpy array
        - "wald_statistics" : Wald statistics
        - "wald_se" : Wald standard errors

    """
    p_values = shared_state["p_values"]
    wald_statistics = shared_state["wald_statistics"]
    wald_se = shared_state["wald_se"]

    self.local_adata.varm["p_values"] = p_values
    self.local_adata.varm["wald_statistics"] = wald_statistics
    self.local_adata.varm["wald_se"] = wald_se

    padj = pd.Series(np.nan, index=self.local_adata.var_names)
    padj.loc[~np.isnan(p_values)] = false_discovery_control(
        p_values[~np.isnan(p_values)], method="bh"
    )

    self.local_adata.varm["padj"] = padj

cooks_filtering

Substep to perform cooks filtering.

cooks_filtering

Module to implement the base Mixin class for Cooks filtering.

CooksFiltering

Bases: LocFindCooksOutliers, AggregateCooksOutliers, LocGetMaxCooks, AggMaxCooks, LocGetMaxCooksCounts, AggMaxCooksCounts, LocCountNumberSamplesAbove, AggCooksFiltering

A class to perform Cooks filtering of p-values.

Methods:

Name Description
cooks_filtering

The method to find Cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/cooks_filtering.py
class CooksFiltering(
    LocFindCooksOutliers,
    AggregateCooksOutliers,
    LocGetMaxCooks,
    AggMaxCooks,
    LocGetMaxCooksCounts,
    AggMaxCooksCounts,
    LocCountNumberSamplesAbove,
    AggCooksFiltering,
):
    """A class to perform Cooks filtering of p-values.

    Methods
    -------
    cooks_filtering
        The method to find Cooks outliers.
    """

    def cooks_filtering(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_test_shared_state,
        round_idx,
        clean_models,
    ):
        """Perform Cooks filtering.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: list[dict]
            Local states. Required to propagate intermediate results.

        wald_test_shared_state : dict
            A shared state containing the Wald test results.
            These results are the following fields:
            - "p_values": p-values of the Wald test.
            - "wald_statistics" : Wald statistics.
            - "wald_se" : Wald standard errors.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.

        Returns
        -------
        local_states: dict
            Local states. The new local state contains Cook's distances.

        shared_state: dict
            A new shared state containing the following fields:
            - "p_values": p-values of the Wald test, updated to be nan for Cook's
            outliers.
            - "wald_statistics" : Wald statistics, for compatibility.
            - "wald_se" : Wald standard errors, for compatibility.

        round_idx: int
            The updated round index.

        """
        local_states, shared_states, round_idx = local_step(
            local_method=self.find_local_cooks_outliers,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=wald_test_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Find local Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        cooks_outliers_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_cooks_outliers,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Find the global Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.get_max_local_cooks,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=cooks_outliers_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Get local max cooks distance",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        max_cooks_distance_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_max_cooks,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Get the max cooks distance for the outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.get_max_local_cooks_gene_counts,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=max_cooks_distance_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Get the local max gene counts for the Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        max_cooks_gene_counts_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_max_cooks_gene_counts,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Get the max gene counts for the Cook's outliers",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        local_states, shared_states, round_idx = local_step(
            local_method=self.count_local_number_samples_above,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            input_local_states=local_states,
            input_shared_state=max_cooks_gene_counts_shared_state,
            aggregation_id=aggregation_node.organization_id,
            description="Count the number of samples above the max gene counts",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        cooks_filtered_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_cooks_filtering,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            description="Finish Cooks filtering",
            round_idx=round_idx,
            clean_models=clean_models,
        )

        return local_states, cooks_filtered_shared_state, round_idx
cooks_filtering(train_data_nodes, aggregation_node, local_states, wald_test_shared_state, round_idx, clean_models)

Perform Cooks filtering.

Parameters:

Name Type Description Default
train_data_nodes

List of TrainDataNode.

required
aggregation_node

The aggregation node.

required
local_states

Local states. Required to propagate intermediate results.

required
wald_test_shared_state dict

A shared state containing the Wald test results. These results are the following fields: - "p_values": p-values of the Wald test. - "wald_statistics" : Wald statistics. - "wald_se" : Wald standard errors.

required
round_idx

Index of the current round.

required
clean_models

Whether to clean the models after the computation.

required

Returns:

Name Type Description
local_states dict

Local states. The new local state contains Cook's distances.

shared_state dict

A new shared state containing the following fields: - "p_values": p-values of the Wald test, updated to be nan for Cook's outliers. - "wald_statistics" : Wald statistics, for compatibility. - "wald_se" : Wald standard errors, for compatibility.

round_idx int

The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/cooks_filtering.py
def cooks_filtering(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    wald_test_shared_state,
    round_idx,
    clean_models,
):
    """Perform Cooks filtering.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: list[dict]
        Local states. Required to propagate intermediate results.

    wald_test_shared_state : dict
        A shared state containing the Wald test results.
        These results are the following fields:
        - "p_values": p-values of the Wald test.
        - "wald_statistics" : Wald statistics.
        - "wald_se" : Wald standard errors.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.

    Returns
    -------
    local_states: dict
        Local states. The new local state contains Cook's distances.

    shared_state: dict
        A new shared state containing the following fields:
        - "p_values": p-values of the Wald test, updated to be nan for Cook's
        outliers.
        - "wald_statistics" : Wald statistics, for compatibility.
        - "wald_se" : Wald standard errors, for compatibility.

    round_idx: int
        The updated round index.

    """
    local_states, shared_states, round_idx = local_step(
        local_method=self.find_local_cooks_outliers,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=wald_test_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Find local Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    cooks_outliers_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_cooks_outliers,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Find the global Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.get_max_local_cooks,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=cooks_outliers_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Get local max cooks distance",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    max_cooks_distance_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_max_cooks,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Get the max cooks distance for the outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.get_max_local_cooks_gene_counts,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=max_cooks_distance_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Get the local max gene counts for the Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    max_cooks_gene_counts_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_max_cooks_gene_counts,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Get the max gene counts for the Cook's outliers",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    local_states, shared_states, round_idx = local_step(
        local_method=self.count_local_number_samples_above,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        input_local_states=local_states,
        input_shared_state=max_cooks_gene_counts_shared_state,
        aggregation_id=aggregation_node.organization_id,
        description="Count the number of samples above the max gene counts",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    cooks_filtered_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_cooks_filtering,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        description="Finish Cooks filtering",
        round_idx=round_idx,
        clean_models=clean_models,
    )

    return local_states, cooks_filtered_shared_state, round_idx

substeps

AggCooksFiltering

Mixin class to aggregate the cooks filtering.

Methods:

Name Description
agg_cooks_filtering

Aggregate the local number of samples above.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class AggCooksFiltering:
    """Mixin class to aggregate the cooks filtering.

    Methods
    -------
    agg_cooks_filtering
        Aggregate the local number of samples above.

    """

    @remote
    @log_remote
    def agg_cooks_filtering(self, shared_states: list[dict]) -> dict:
        """
        Aggregate the local number of samples above to get cooks filtered genes.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_num_samples_above: np.ndarray of shape (n_genes,)
                The local number of samples above the max cooks gene counts.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test.
            - wald_statistics: np.ndarray of shape (n_genes,)
                The Wald statistics from the Wald test.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors from the Wald test.

        Returns
        -------
        dict
            A shared state with the following fields:
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test with nan for the cooks outliers.
            - wald_statistics: np.ndarray of shape (n_genes,)
                The Wald statistics.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors.

        """
        # Find the number of samples with counts above the max cooks
        cooks_outliers = shared_states[0]["cooks_outliers"]

        num_samples_above_max_cooks = np.sum(
            [state["local_num_samples_above"] for state in shared_states], axis=0
        )

        # If that number is greater than 3, set the cooks filter to false
        cooks_outliers[cooks_outliers] = num_samples_above_max_cooks < 3

        # Set the p-values to nan on cooks outliers
        p_values = shared_states[0]["p_values"]
        p_values[cooks_outliers] = np.nan

        return {
            "p_values": p_values,
            "wald_statistics": shared_states[0]["wald_statistics"],
            "wald_se": shared_states[0]["wald_se"],
        }
agg_cooks_filtering(shared_states)

Aggregate the local number of samples above to get cooks filtered genes.

Parameters:

Name Type Description Default
shared_states list[dict]

List of shared states from the local step with the following keys: - local_num_samples_above: np.ndarray of shape (n_genes,) The local number of samples above the max cooks gene counts. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test. - wald_statistics: np.ndarray of shape (n_genes,) The Wald statistics from the Wald test. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors from the Wald test.

required

Returns:

Type Description
dict

A shared state with the following fields: - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test with nan for the cooks outliers. - wald_statistics: np.ndarray of shape (n_genes,) The Wald statistics. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote
@log_remote
def agg_cooks_filtering(self, shared_states: list[dict]) -> dict:
    """
    Aggregate the local number of samples above to get cooks filtered genes.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_num_samples_above: np.ndarray of shape (n_genes,)
            The local number of samples above the max cooks gene counts.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test.
        - wald_statistics: np.ndarray of shape (n_genes,)
            The Wald statistics from the Wald test.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors from the Wald test.

    Returns
    -------
    dict
        A shared state with the following fields:
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test with nan for the cooks outliers.
        - wald_statistics: np.ndarray of shape (n_genes,)
            The Wald statistics.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors.

    """
    # Find the number of samples with counts above the max cooks
    cooks_outliers = shared_states[0]["cooks_outliers"]

    num_samples_above_max_cooks = np.sum(
        [state["local_num_samples_above"] for state in shared_states], axis=0
    )

    # If that number is greater than 3, set the cooks filter to false
    cooks_outliers[cooks_outliers] = num_samples_above_max_cooks < 3

    # Set the p-values to nan on cooks outliers
    p_values = shared_states[0]["p_values"]
    p_values[cooks_outliers] = np.nan

    return {
        "p_values": p_values,
        "wald_statistics": shared_states[0]["wald_statistics"],
        "wald_se": shared_states[0]["wald_se"],
    }

AggMaxCooks

Mixin class to aggregate the max cooks distances.

Methods:

Name Description
agg_max_cooks

Aggregate the local max cooks distances.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class AggMaxCooks:
    """Mixin class to aggregate the max cooks distances.

    Methods
    -------
    agg_max_cooks
        Aggregate the local max cooks distances.

    """

    @remote
    @log_remote
    def agg_max_cooks(self, shared_states: list[dict]) -> dict:
        """
        Aggregate the local max cooks.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_max_cooks: np.ndarray of shape (n_genes,)
                The local maximum cooks distance for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            Aggregated max cooks.
            It is a dictionary with the following fields:
            - max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers in the aggregated dataset.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
        """
        return {
            "max_cooks": np.max(
                [state["local_max_cooks"] for state in shared_states], axis=0
            ),
            "cooks_outliers": shared_states[0]["cooks_outliers"],
        }
agg_max_cooks(shared_states)

Aggregate the local max cooks.

Parameters:

Name Type Description Default
shared_states list[dict]

List of shared states from the local step with the following keys: - local_max_cooks: np.ndarray of shape (n_genes,) The local maximum cooks distance for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

required

Returns:

Name Type Description
shared_state dict

Aggregated max cooks. It is a dictionary with the following fields: - max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers in the aggregated dataset. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote
@log_remote
def agg_max_cooks(self, shared_states: list[dict]) -> dict:
    """
    Aggregate the local max cooks.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_max_cooks: np.ndarray of shape (n_genes,)
            The local maximum cooks distance for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        Aggregated max cooks.
        It is a dictionary with the following fields:
        - max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers in the aggregated dataset.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
    """
    return {
        "max_cooks": np.max(
            [state["local_max_cooks"] for state in shared_states], axis=0
        ),
        "cooks_outliers": shared_states[0]["cooks_outliers"],
    }

AggMaxCooksCounts

Mixin class to aggregate the max cooks gene counts.

Methods:

Name Description
agg_max_cooks_gene_counts

Aggregate the local max cooks gene counts. The goal is to have the gene counts corresponding to the maximum cooks distance for each gene across all datasets.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class AggMaxCooksCounts:
    """Mixin class to aggregate the max cooks gene counts.

    Methods
    -------
    agg_max_cooks_gene_counts
        Aggregate the local max cooks gene counts. The goal is to have the gene
        counts corresponding to the maximum cooks distance for each gene across
        all datasets.

    """

    @remote
    @log_remote
    def agg_max_cooks_gene_counts(self, shared_states: list[dict]) -> dict:
        """
        Aggregate the local max cooks gene counts.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,)
                The local maximum cooks gene counts for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene across all datasets.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        """
        return {
            "max_cooks_gene_counts": np.ma.stack(
                [state["local_max_cooks_gene_counts"] for state in shared_states],
                axis=0,
            ).min(axis=0),
            "cooks_outliers": shared_states[0]["cooks_outliers"],
        }
agg_max_cooks_gene_counts(shared_states)

Aggregate the local max cooks gene counts.

Parameters:

Name Type Description Default
shared_states list[dict]

List of shared states from the local step with the following keys: - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,) The local maximum cooks gene counts for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

required

Returns:

Name Type Description
shared_state dict

A shared state with the following fields: - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene across all datasets. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote
@log_remote
def agg_max_cooks_gene_counts(self, shared_states: list[dict]) -> dict:
    """
    Aggregate the local max cooks gene counts.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_max_cooks_gene_counts: np.ndarray of shape (n_genes,)
            The local maximum cooks gene counts for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene across all datasets.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    """
    return {
        "max_cooks_gene_counts": np.ma.stack(
            [state["local_max_cooks_gene_counts"] for state in shared_states],
            axis=0,
        ).min(axis=0),
        "cooks_outliers": shared_states[0]["cooks_outliers"],
    }

AggregateCooksOutliers

Mixin class to aggregate the cooks outliers.

Methods:

Name Description
agg_cooks_outliers

Aggregate the local cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class AggregateCooksOutliers:
    """Mixin class to aggregate the cooks outliers.

    Methods
    -------
    agg_cooks_outliers
        Aggregate the local cooks outliers.

    """

    @remote
    @log_remote
    @prepare_cooks_agg
    def agg_cooks_outliers(self, shared_states: list[dict]) -> dict:
        """
        Aggregate the local cooks outliers.

        Parameters
        ----------
        shared_states : list[dict]
            List of shared states from the local step with the following keys:
            - local_cooks_outliers: np.ndarray of shape (n_genes,)
            - cooks_cutoff: float

        Returns
        -------
        shared_state : dict
            Aggregated cooks outliers.
            It is a dictionary with the following fields:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier in
                any of the local datasets
            - cooks_cutoff: float
                The cutoff used to define the fact that a gene is a cooks outlier.
        """
        return {
            "cooks_outliers": np.any(
                [state["local_cooks_outliers"] for state in shared_states], axis=0
            ),
            "cooks_cutoff": shared_states[0]["cooks_cutoff"],
        }
agg_cooks_outliers(shared_states)

Aggregate the local cooks outliers.

Parameters:

Name Type Description Default
shared_states list[dict]

List of shared states from the local step with the following keys: - local_cooks_outliers: np.ndarray of shape (n_genes,) - cooks_cutoff: float

required

Returns:

Name Type Description
shared_state dict

Aggregated cooks outliers. It is a dictionary with the following fields: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier in any of the local datasets - cooks_cutoff: float The cutoff used to define the fact that a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote
@log_remote
@prepare_cooks_agg
def agg_cooks_outliers(self, shared_states: list[dict]) -> dict:
    """
    Aggregate the local cooks outliers.

    Parameters
    ----------
    shared_states : list[dict]
        List of shared states from the local step with the following keys:
        - local_cooks_outliers: np.ndarray of shape (n_genes,)
        - cooks_cutoff: float

    Returns
    -------
    shared_state : dict
        Aggregated cooks outliers.
        It is a dictionary with the following fields:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier in
            any of the local datasets
        - cooks_cutoff: float
            The cutoff used to define the fact that a gene is a cooks outlier.
    """
    return {
        "cooks_outliers": np.any(
            [state["local_cooks_outliers"] for state in shared_states], axis=0
        ),
        "cooks_cutoff": shared_states[0]["cooks_cutoff"],
    }

LocCountNumberSamplesAbove

Mixin class to count the number of samples above the max cooks gene counts.

Attributes:

Name Type Description
local_adata AnnData

Methods:

Name Description
count_local_number_samples_above

Count the number of samples above the max cooks gene counts.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class LocCountNumberSamplesAbove:
    """Mixin class to count the number of samples above the max cooks gene counts.

    Attributes
    ----------
    local_adata : AnnData

    Methods
    -------
    count_local_number_samples_above
        Count the number of samples above the max cooks gene counts.

    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def count_local_number_samples_above(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Count the number of samples above the max cooks gene counts.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - max_cooks_gene_counts: np.ndarray of shape (n_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene across all datasets.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_num_samples_above: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the number of samples above the
                maximum cooks gene counts.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - p_values: np.ndarray of shape (n_genes,)
                The p-values from the Wald test.
            - wald_statistic: np.ndarray of shape (n_genes,)
                The Wald statistics from the Wald test.
            - wald_se: np.ndarray of shape (n_genes,)
                The Wald standard errors from the Wald test.

        """
        cooks_outliers = shared_state["cooks_outliers"]
        max_cooks_gene_counts = shared_state["max_cooks_gene_counts"]

        num_samples_above = np.sum(
            self.local_adata.X[:, cooks_outliers] > max_cooks_gene_counts, axis=0
        )

        return {
            "local_num_samples_above": num_samples_above,
            "cooks_outliers": cooks_outliers,
            "p_values": self.local_adata.varm["p_values"],
            "wald_statistics": self.local_adata.varm["wald_statistics"],
            "wald_se": self.local_adata.varm["wald_se"],
        }
count_local_number_samples_above(data_from_opener, shared_state)

Count the number of samples above the max cooks gene counts.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state from the previous step with the following keys: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - max_cooks_gene_counts: np.ndarray of shape (n_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene across all datasets.

required

Returns:

Name Type Description
shared_state dict

A shared state with the following fields: - local_num_samples_above: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the number of samples above the maximum cooks gene counts. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - p_values: np.ndarray of shape (n_genes,) The p-values from the Wald test. - wald_statistic: np.ndarray of shape (n_genes,) The Wald statistics from the Wald test. - wald_se: np.ndarray of shape (n_genes,) The Wald standard errors from the Wald test.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
def count_local_number_samples_above(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Count the number of samples above the max cooks gene counts.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - max_cooks_gene_counts: np.ndarray of shape (n_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene across all datasets.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_num_samples_above: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the number of samples above the
            maximum cooks gene counts.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - p_values: np.ndarray of shape (n_genes,)
            The p-values from the Wald test.
        - wald_statistic: np.ndarray of shape (n_genes,)
            The Wald statistics from the Wald test.
        - wald_se: np.ndarray of shape (n_genes,)
            The Wald standard errors from the Wald test.

    """
    cooks_outliers = shared_state["cooks_outliers"]
    max_cooks_gene_counts = shared_state["max_cooks_gene_counts"]

    num_samples_above = np.sum(
        self.local_adata.X[:, cooks_outliers] > max_cooks_gene_counts, axis=0
    )

    return {
        "local_num_samples_above": num_samples_above,
        "cooks_outliers": cooks_outliers,
        "p_values": self.local_adata.varm["p_values"],
        "wald_statistics": self.local_adata.varm["wald_statistics"],
        "wald_se": self.local_adata.varm["wald_se"],
    }

LocFindCooksOutliers

Mixin class to find the local cooks outliers.

Attributes:

Name Type Description
local_adata AnnData

Local AnnData object. Is expected to have a "tot_num_samples" key in uns.

refit_cooks bool

Whether to refit the cooks outliers.

Methods:

Name Description
find_local_cooks_outliers

Find the local cooks outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class LocFindCooksOutliers:
    """Mixin class to find the local cooks outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.
        Is expected to have a "tot_num_samples" key in uns.

    refit_cooks : bool
        Whether to refit the cooks outliers.


    Methods
    -------
    find_local_cooks_outliers
        Find the local cooks outliers.

    """

    local_adata: AnnData
    refit_cooks: bool

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    @prepare_cooks_local
    def find_local_cooks_outliers(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Find the local cooks outliers.

        This method is expected to run on the results of the Wald tests.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - p_values: np.ndarray of shape (n_genes,)
            - wald_statistics: np.ndarray of shape (n_genes,)
            - wald_se: np.ndarray of shape (n_genes,)

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - cooks_cutoff: float
                The cutoff used to define the fact that a gene is a cooks outlier.

        """
        # Save these in the local adata
        self.local_adata.varm["p_values"] = shared_state["p_values"]
        self.local_adata.varm["wald_statistics"] = shared_state["wald_statistics"]
        self.local_adata.varm["wald_se"] = shared_state["wald_se"]

        tot_num_samples = self.local_adata.uns["tot_num_samples"]
        num_vars = self.local_adata.uns["n_params"]

        cooks_cutoff = f.ppf(0.99, num_vars, tot_num_samples - num_vars)

        # Take into account whether we already replaced outliers
        cooks_layer = (
            "replace_cooks"
            if self.refit_cooks and self.local_adata.varm["refitted"].sum() > 0
            else "cooks"
        )

        if cooks_layer == "replace_cooks":
            assert "replaced" in self.local_adata.varm.keys()
            replace_cooks = pd.DataFrame(self.local_adata.layers["cooks"].copy())
            replace_cooks.loc[
                self.local_adata.obsm["replaceable"], self.local_adata.varm["refitted"]
            ] = 0.0
            self.local_adata.layers["replace_cooks"] = replace_cooks

        use_for_max = self.local_adata.obs["cells"].apply(
            lambda x: (self.local_adata.uns["num_replicates"] >= 3).loc[x]
        )

        cooks_outliers = (
            (self.local_adata[use_for_max, :].layers[cooks_layer] > cooks_cutoff)
            .any(axis=0)
            .copy()
        )

        return {"local_cooks_outliers": cooks_outliers, "cooks_cutoff": cooks_cutoff}
find_local_cooks_outliers(data_from_opener, shared_state)

Find the local cooks outliers.

This method is expected to run on the results of the Wald tests.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state from the previous step with the following keys: - p_values: np.ndarray of shape (n_genes,) - wald_statistics: np.ndarray of shape (n_genes,) - wald_se: np.ndarray of shape (n_genes,)

required

Returns:

Name Type Description
shared_state dict

A shared state with the following fields: - local_cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - cooks_cutoff: float The cutoff used to define the fact that a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
@prepare_cooks_local
def find_local_cooks_outliers(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Find the local cooks outliers.

    This method is expected to run on the results of the Wald tests.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - p_values: np.ndarray of shape (n_genes,)
        - wald_statistics: np.ndarray of shape (n_genes,)
        - wald_se: np.ndarray of shape (n_genes,)

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - cooks_cutoff: float
            The cutoff used to define the fact that a gene is a cooks outlier.

    """
    # Save these in the local adata
    self.local_adata.varm["p_values"] = shared_state["p_values"]
    self.local_adata.varm["wald_statistics"] = shared_state["wald_statistics"]
    self.local_adata.varm["wald_se"] = shared_state["wald_se"]

    tot_num_samples = self.local_adata.uns["tot_num_samples"]
    num_vars = self.local_adata.uns["n_params"]

    cooks_cutoff = f.ppf(0.99, num_vars, tot_num_samples - num_vars)

    # Take into account whether we already replaced outliers
    cooks_layer = (
        "replace_cooks"
        if self.refit_cooks and self.local_adata.varm["refitted"].sum() > 0
        else "cooks"
    )

    if cooks_layer == "replace_cooks":
        assert "replaced" in self.local_adata.varm.keys()
        replace_cooks = pd.DataFrame(self.local_adata.layers["cooks"].copy())
        replace_cooks.loc[
            self.local_adata.obsm["replaceable"], self.local_adata.varm["refitted"]
        ] = 0.0
        self.local_adata.layers["replace_cooks"] = replace_cooks

    use_for_max = self.local_adata.obs["cells"].apply(
        lambda x: (self.local_adata.uns["num_replicates"] >= 3).loc[x]
    )

    cooks_outliers = (
        (self.local_adata[use_for_max, :].layers[cooks_layer] > cooks_cutoff)
        .any(axis=0)
        .copy()
    )

    return {"local_cooks_outliers": cooks_outliers, "cooks_cutoff": cooks_cutoff}

LocGetMaxCooks

Mixin class to get the maximum cooks distance for the outliers.

Attributes:

Name Type Description
local_adata AnnData

Local AnnData object.

Methods:

Name Description
get_max_local_cooks

Get the maximum cooks distance for the outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class LocGetMaxCooks:
    """Mixin class to get the maximum cooks distance for the outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    get_max_local_cooks
        Get the maximum cooks distance for the outliers.

    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def get_max_local_cooks(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Get the maximum cooks distance for the outliers.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.
            - cooks_cutoff: float

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers in the local dataset.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        """
        cooks_outliers = shared_state["cooks_outliers"]
        cooks_cutoff = shared_state["cooks_cutoff"]

        max_cooks = np.max(self.local_adata.layers["cooks"][:, cooks_outliers], axis=0)

        max_cooks[max_cooks <= cooks_cutoff] = 0.0

        max_cooks_idx = self.local_adata.layers["cooks"][:, cooks_outliers].argmax(
            axis=0
        )

        max_cooks_value = self.local_adata.layers["cooks"][:, cooks_outliers][
            max_cooks_idx, np.arange(len(max_cooks))
        ]

        max_cooks_gene_counts = self.local_adata.X[:, cooks_outliers][
            max_cooks_idx, np.arange(len(max_cooks))
        ]

        # Save the max cooks gene counts and max cooks value
        self.local_adata.uns["max_cooks_gene_counts"] = max_cooks_gene_counts
        self.local_adata.uns["max_cooks_value"] = max_cooks_value

        return {
            "local_max_cooks": max_cooks,
            "cooks_outliers": cooks_outliers,
        }
get_max_local_cooks(data_from_opener, shared_state)

Get the maximum cooks distance for the outliers.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state from the previous step with the following keys: - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier. - cooks_cutoff: float

required

Returns:

Name Type Description
shared_state dict

A shared state with the following fields: - local_max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers in the local dataset. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
def get_max_local_cooks(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Get the maximum cooks distance for the outliers.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.
        - cooks_cutoff: float

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers in the local dataset.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    """
    cooks_outliers = shared_state["cooks_outliers"]
    cooks_cutoff = shared_state["cooks_cutoff"]

    max_cooks = np.max(self.local_adata.layers["cooks"][:, cooks_outliers], axis=0)

    max_cooks[max_cooks <= cooks_cutoff] = 0.0

    max_cooks_idx = self.local_adata.layers["cooks"][:, cooks_outliers].argmax(
        axis=0
    )

    max_cooks_value = self.local_adata.layers["cooks"][:, cooks_outliers][
        max_cooks_idx, np.arange(len(max_cooks))
    ]

    max_cooks_gene_counts = self.local_adata.X[:, cooks_outliers][
        max_cooks_idx, np.arange(len(max_cooks))
    ]

    # Save the max cooks gene counts and max cooks value
    self.local_adata.uns["max_cooks_gene_counts"] = max_cooks_gene_counts
    self.local_adata.uns["max_cooks_value"] = max_cooks_value

    return {
        "local_max_cooks": max_cooks,
        "cooks_outliers": cooks_outliers,
    }

LocGetMaxCooksCounts

Mixin class to get the maximum cooks counts for the outliers.

Attributes:

Name Type Description
local_adata AnnData

Local AnnData object.

Methods:

Name Description
get_max_local_cooks_gene_counts

Get the maximum cooks counts for the outliers.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
class LocGetMaxCooksCounts:
    """Mixin class to get the maximum cooks counts for the outliers.

    Attributes
    ----------
    local_adata : AnnData
        Local AnnData object.

    Methods
    -------
    get_max_local_cooks_gene_counts
        Get the maximum cooks counts for the outliers.

    """

    local_adata: AnnData

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    def get_max_local_cooks_gene_counts(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Get the maximum cooks counts for the outliers.

        Parameters
        ----------
        data_from_opener : AnnData
            Not used.

        shared_state : dict
            Shared state from the previous step with the following
            keys:
            - max_cooks: np.ndarray of shape (n_cooks_genes,)
                The maximum cooks distance for the outliers.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        Returns
        -------
        shared_state : dict
            A shared state with the following fields:
            - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
                For each gene, the array contains the gene counts corresponding to the
                maximum cooks distance for that gene if the maximum cooks distance
                in the local dataset is equal to the maximum cooks distance in the
                aggregated dataset, and nan otherwise.
            - cooks_outliers: np.ndarray of shape (n_genes,)
                It is a boolean array indicating whether a gene is a cooks outlier.

        """
        max_cooks = shared_state["max_cooks"]
        cooks_outliers = shared_state["cooks_outliers"]

        max_cooks_gene_counts = self.local_adata.uns["max_cooks_gene_counts"].copy()
        max_cooks_value = self.local_adata.uns["max_cooks_value"].copy()

        # Remove them from the uns field as they are no longer needed
        del self.local_adata.uns["max_cooks_gene_counts"]
        del self.local_adata.uns["max_cooks_value"]

        max_cooks_gene_counts[
            max_cooks_value < max_cooks
        ] = -1  # We can use a < because the count value are non negative integers.

        max_cooks_gene_counts_ma = np.ma.masked_array(
            max_cooks_gene_counts, max_cooks_gene_counts == -1
        )

        return {
            "local_max_cooks_gene_counts": max_cooks_gene_counts_ma,
            "cooks_outliers": cooks_outliers,
        }
get_max_local_cooks_gene_counts(data_from_opener, shared_state)

Get the maximum cooks counts for the outliers.

Parameters:

Name Type Description Default
data_from_opener AnnData

Not used.

required
shared_state dict

Shared state from the previous step with the following keys: - max_cooks: np.ndarray of shape (n_cooks_genes,) The maximum cooks distance for the outliers. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

required

Returns:

Name Type Description
shared_state dict

A shared state with the following fields: - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,) For each gene, the array contains the gene counts corresponding to the maximum cooks distance for that gene if the maximum cooks distance in the local dataset is equal to the maximum cooks distance in the aggregated dataset, and nan otherwise. - cooks_outliers: np.ndarray of shape (n_genes,) It is a boolean array indicating whether a gene is a cooks outlier.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/cooks_filtering/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
def get_max_local_cooks_gene_counts(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Get the maximum cooks counts for the outliers.

    Parameters
    ----------
    data_from_opener : AnnData
        Not used.

    shared_state : dict
        Shared state from the previous step with the following
        keys:
        - max_cooks: np.ndarray of shape (n_cooks_genes,)
            The maximum cooks distance for the outliers.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    Returns
    -------
    shared_state : dict
        A shared state with the following fields:
        - local_max_cooks_gene_counts: np.ndarray of shape (n_cooks_genes,)
            For each gene, the array contains the gene counts corresponding to the
            maximum cooks distance for that gene if the maximum cooks distance
            in the local dataset is equal to the maximum cooks distance in the
            aggregated dataset, and nan otherwise.
        - cooks_outliers: np.ndarray of shape (n_genes,)
            It is a boolean array indicating whether a gene is a cooks outlier.

    """
    max_cooks = shared_state["max_cooks"]
    cooks_outliers = shared_state["cooks_outliers"]

    max_cooks_gene_counts = self.local_adata.uns["max_cooks_gene_counts"].copy()
    max_cooks_value = self.local_adata.uns["max_cooks_value"].copy()

    # Remove them from the uns field as they are no longer needed
    del self.local_adata.uns["max_cooks_gene_counts"]
    del self.local_adata.uns["max_cooks_value"]

    max_cooks_gene_counts[
        max_cooks_value < max_cooks
    ] = -1  # We can use a < because the count value are non negative integers.

    max_cooks_gene_counts_ma = np.ma.masked_array(
        max_cooks_gene_counts, max_cooks_gene_counts == -1
    )

    return {
        "local_max_cooks_gene_counts": max_cooks_gene_counts_ma,
        "cooks_outliers": cooks_outliers,
    }

deseq2_stats

DESeq2Stats

Bases: RunWaldTests, CooksFiltering, ComputeAdjustedPValues

Mixin class to compute statistics with DESeq2.

This class encapsulates the Wald tests, the Cooks filtering and the computation of adjusted p-values.

Methods:

Name Description
run_deseq2_stats

Run the DESeq2 statistics pipeline. Performs Wald tests, Cook's filtering and computes adjusted p-values.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/deseq2_stats.py
class DESeq2Stats(RunWaldTests, CooksFiltering, ComputeAdjustedPValues):
    """Mixin class to compute statistics with DESeq2.

    This class encapsulates the Wald tests, the Cooks filtering and the computation
    of adjusted p-values.

    Methods
    -------
    run_deseq2_stats
        Run the DESeq2 statistics pipeline.
        Performs Wald tests, Cook's filtering and computes adjusted p-values.

    """

    cooks_filter: bool

    def run_deseq2_stats(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models,
    ):
        """
        Run the DESeq2 statistics pipeline.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: list[dict]
            Local states. Required to propagate intermediate results.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.


        Returns
        -------
        local_states: dict
            Local states.

        round_idx: int
            The updated round index.

        """
        #### Perform Wald tests ####
        logger.info("Running Wald tests.")

        local_states, wald_shared_state, round_idx = self.run_wald_tests(
            train_data_nodes,
            aggregation_node,
            local_states,
            round_idx,
            clean_models=clean_models,
        )

        logger.info("Finished running Wald tests.")

        if self.cooks_filter:
            logger.info("Running Cook's filtering...")
            local_states, wald_shared_state, round_idx = self.cooks_filtering(
                train_data_nodes,
                aggregation_node,
                local_states,
                wald_shared_state,
                round_idx,
                clean_models=clean_models,
            )
            logger.info("Finished running Cook's filtering.")
            logger.info("Computing adjusted p-values...")
        (
            local_states,
            round_idx,
        ) = self.compute_adjusted_p_values(
            train_data_nodes,
            aggregation_node,
            local_states,
            wald_shared_state,
            round_idx,
            clean_models=clean_models,
        )
        logger.info("Finished computing adjusted p-values.")

        return local_states, round_idx

run_deseq2_stats(train_data_nodes, aggregation_node, local_states, round_idx, clean_models)

Run the DESeq2 statistics pipeline.

Parameters:

Name Type Description Default
train_data_nodes

List of TrainDataNode.

required
aggregation_node

The aggregation node.

required
local_states

Local states. Required to propagate intermediate results.

required
round_idx

Index of the current round.

required
clean_models

Whether to clean the models after the computation.

required

Returns:

Name Type Description
local_states dict

Local states.

round_idx int

The updated round index.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/deseq2_stats.py
def run_deseq2_stats(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    round_idx,
    clean_models,
):
    """
    Run the DESeq2 statistics pipeline.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: list[dict]
        Local states. Required to propagate intermediate results.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.


    Returns
    -------
    local_states: dict
        Local states.

    round_idx: int
        The updated round index.

    """
    #### Perform Wald tests ####
    logger.info("Running Wald tests.")

    local_states, wald_shared_state, round_idx = self.run_wald_tests(
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models=clean_models,
    )

    logger.info("Finished running Wald tests.")

    if self.cooks_filter:
        logger.info("Running Cook's filtering...")
        local_states, wald_shared_state, round_idx = self.cooks_filtering(
            train_data_nodes,
            aggregation_node,
            local_states,
            wald_shared_state,
            round_idx,
            clean_models=clean_models,
        )
        logger.info("Finished running Cook's filtering.")
        logger.info("Computing adjusted p-values...")
    (
        local_states,
        round_idx,
    ) = self.compute_adjusted_p_values(
        train_data_nodes,
        aggregation_node,
        local_states,
        wald_shared_state,
        round_idx,
        clean_models=clean_models,
    )
    logger.info("Finished computing adjusted p-values.")

    return local_states, round_idx

wald_tests

substeps

AggRunWaldTests

Mixin to run Wald tests.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py
class AggRunWaldTests:
    """Mixin to run Wald tests."""

    lfc_null: float
    alt_hypothesis: Literal["greaterAbs", "lessAbs", "greater", "less"] | None
    num_jobs: int
    joblib_verbosity: int
    joblib_backend: str

    @remote
    @log_remote
    @prepare_cooks_agg
    def agg_run_wald_tests(self, shared_states: list) -> dict:
        """Run the Wald tests.

        Parameters
        ----------
        shared_states : list
            List of shared states containing:
            - local_H_matrix: np.ndarray
                The local H matrix.
            - LFC: np.ndarray
                The log fold changes, in natural log scale.
            - contrast_vector: np.ndarray
                The contrast vector.

        Returns
        -------
        dict
            Contains:
            - p_values: np.ndarray
                The (unadjusted) p-values (n_genes,).
            - wald_statistics: np.ndarray
                The Wald statistics (n_genes,).
            - wald_se: np.ndarray
                The standard errors of the Wald statistics (n_genes,).
        """
        # First step: aggregate the local H matrices

        H = sum([state["local_H_matrix"] for state in shared_states])

        # Second step: compute the Wald tests in parallel
        with parallel_backend(self.joblib_backend):
            wald_test_results = Parallel(
                n_jobs=self.num_jobs, verbose=self.joblib_verbosity
            )(
                delayed(wald_test)(
                    H[i],
                    shared_states[0]["LFC"].values[i],
                    None,
                    shared_states[0]["contrast_vector"],
                    np.log(2) * self.lfc_null,
                    self.alt_hypothesis,
                )
                for i in range(len(H))
            )

        # Finally, unpack the results
        p_values = np.array([r[0] for r in wald_test_results])
        wald_statistics = np.array([r[1] for r in wald_test_results])
        wald_se = np.array([r[2] for r in wald_test_results])

        return {
            "p_values": p_values,
            "wald_statistics": wald_statistics,
            "wald_se": wald_se,
        }
agg_run_wald_tests(shared_states)

Run the Wald tests.

Parameters:

Name Type Description Default
shared_states list

List of shared states containing: - local_H_matrix: np.ndarray The local H matrix. - LFC: np.ndarray The log fold changes, in natural log scale. - contrast_vector: np.ndarray The contrast vector.

required

Returns:

Type Description
dict

Contains: - p_values: np.ndarray The (unadjusted) p-values (n_genes,). - wald_statistics: np.ndarray The Wald statistics (n_genes,). - wald_se: np.ndarray The standard errors of the Wald statistics (n_genes,).

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py
@remote
@log_remote
@prepare_cooks_agg
def agg_run_wald_tests(self, shared_states: list) -> dict:
    """Run the Wald tests.

    Parameters
    ----------
    shared_states : list
        List of shared states containing:
        - local_H_matrix: np.ndarray
            The local H matrix.
        - LFC: np.ndarray
            The log fold changes, in natural log scale.
        - contrast_vector: np.ndarray
            The contrast vector.

    Returns
    -------
    dict
        Contains:
        - p_values: np.ndarray
            The (unadjusted) p-values (n_genes,).
        - wald_statistics: np.ndarray
            The Wald statistics (n_genes,).
        - wald_se: np.ndarray
            The standard errors of the Wald statistics (n_genes,).
    """
    # First step: aggregate the local H matrices

    H = sum([state["local_H_matrix"] for state in shared_states])

    # Second step: compute the Wald tests in parallel
    with parallel_backend(self.joblib_backend):
        wald_test_results = Parallel(
            n_jobs=self.num_jobs, verbose=self.joblib_verbosity
        )(
            delayed(wald_test)(
                H[i],
                shared_states[0]["LFC"].values[i],
                None,
                shared_states[0]["contrast_vector"],
                np.log(2) * self.lfc_null,
                self.alt_hypothesis,
            )
            for i in range(len(H))
        )

    # Finally, unpack the results
    p_values = np.array([r[0] for r in wald_test_results])
    wald_statistics = np.array([r[1] for r in wald_test_results])
    wald_se = np.array([r[2] for r in wald_test_results])

    return {
        "p_values": p_values,
        "wald_statistics": wald_statistics,
        "wald_se": wald_se,
    }

LocBuildContrastVectorHMatrix

Mixin to get compute contrast vectors and local H matrices.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py
class LocBuildContrastVectorHMatrix:
    """Mixin to get compute contrast vectors and local H matrices."""

    local_adata: ad.AnnData
    num_jobs: int
    joblib_verbosity: int
    joblib_backend: str
    irls_batch_size: int

    @remote_data
    @log_remote_data
    @reconstruct_adatas
    @prepare_cooks_local
    def compute_contrast_vector_and_H_matrix(
        self,
        data_from_opener,
        shared_state: dict,
    ) -> dict:
        """Build the contrast vector and the local H matrices.

        Parameters
        ----------
        data_from_opener : ad.AnnData
            AnnData returned by the opener. Not used.

        shared_state : dict
            Not used.

        Returns
        -------
        dict
            Contains:
            - local_H_matrix: np.ndarray
                The local H matrix.
            - LFC: np.ndarray
                The log fold changes, in natural log scale.
            - contrast_vector: np.ndarray
                The contrast vector.
        """
        # Build contrast vector and index
        (
            self.local_adata.uns["contrast_vector"],
            self.local_adata.uns["contrast_idx"],
        ) = build_contrast_vector(
            self.local_adata.uns["contrast"],
            self.local_adata.varm["LFC"].columns,
        )

        # ---- Compute the summands for the covariance matrix ---- #

        with parallel_backend(self.joblib_backend):
            res = Parallel(n_jobs=self.num_jobs, verbose=self.joblib_verbosity)(
                delayed(make_irls_update_summands_and_nll_batch)(
                    self.local_adata.obsm["design_matrix"].values,
                    self.local_adata.obsm["size_factors"],
                    self.local_adata.varm["LFC"][i : i + self.irls_batch_size].values,
                    self.local_adata.varm["dispersions"][i : i + self.irls_batch_size],
                    self.local_adata.X[:, i : i + self.irls_batch_size],
                    0,
                )
                for i in range(0, self.local_adata.n_vars, self.irls_batch_size)
            )

        H = np.concatenate([r[0] for r in res])

        return {
            "local_H_matrix": H,
            "LFC": self.local_adata.varm["LFC"],
            "contrast_vector": self.local_adata.uns["contrast_vector"],
        }
compute_contrast_vector_and_H_matrix(data_from_opener, shared_state)

Build the contrast vector and the local H matrices.

Parameters:

Name Type Description Default
data_from_opener AnnData

AnnData returned by the opener. Not used.

required
shared_state dict

Not used.

required

Returns:

Type Description
dict

Contains: - local_H_matrix: np.ndarray The local H matrix. - LFC: np.ndarray The log fold changes, in natural log scale. - contrast_vector: np.ndarray The contrast vector.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/substeps.py
@remote_data
@log_remote_data
@reconstruct_adatas
@prepare_cooks_local
def compute_contrast_vector_and_H_matrix(
    self,
    data_from_opener,
    shared_state: dict,
) -> dict:
    """Build the contrast vector and the local H matrices.

    Parameters
    ----------
    data_from_opener : ad.AnnData
        AnnData returned by the opener. Not used.

    shared_state : dict
        Not used.

    Returns
    -------
    dict
        Contains:
        - local_H_matrix: np.ndarray
            The local H matrix.
        - LFC: np.ndarray
            The log fold changes, in natural log scale.
        - contrast_vector: np.ndarray
            The contrast vector.
    """
    # Build contrast vector and index
    (
        self.local_adata.uns["contrast_vector"],
        self.local_adata.uns["contrast_idx"],
    ) = build_contrast_vector(
        self.local_adata.uns["contrast"],
        self.local_adata.varm["LFC"].columns,
    )

    # ---- Compute the summands for the covariance matrix ---- #

    with parallel_backend(self.joblib_backend):
        res = Parallel(n_jobs=self.num_jobs, verbose=self.joblib_verbosity)(
            delayed(make_irls_update_summands_and_nll_batch)(
                self.local_adata.obsm["design_matrix"].values,
                self.local_adata.obsm["size_factors"],
                self.local_adata.varm["LFC"][i : i + self.irls_batch_size].values,
                self.local_adata.varm["dispersions"][i : i + self.irls_batch_size],
                self.local_adata.X[:, i : i + self.irls_batch_size],
                0,
            )
            for i in range(0, self.local_adata.n_vars, self.irls_batch_size)
        )

    H = np.concatenate([r[0] for r in res])

    return {
        "local_H_matrix": H,
        "LFC": self.local_adata.varm["LFC"],
        "contrast_vector": self.local_adata.uns["contrast_vector"],
    }

wald_tests

RunWaldTests

Bases: LocBuildContrastVectorHMatrix, AggRunWaldTests

Mixin class to implement the computation of the Wald tests.

Methods:

Name Description
run_wald_tests

The method to compute the Wald tests.

Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/wald_tests.py
class RunWaldTests(LocBuildContrastVectorHMatrix, AggRunWaldTests):
    """Mixin class to implement the computation of the Wald tests.

    Methods
    -------
    run_wald_tests
        The method to compute the Wald tests.
    """

    def run_wald_tests(
        self,
        train_data_nodes,
        aggregation_node,
        local_states,
        round_idx,
        clean_models,
    ):
        """Compute the Wald tests.

        Parameters
        ----------
        train_data_nodes: list
            List of TrainDataNode.

        aggregation_node: AggregationNode
            The aggregation node.

        local_states: dict
            Local states. Required to propagate intermediate results.

        round_idx: int
            Index of the current round.

        clean_models: bool
            Whether to clean the models after the computation.
        """
        # --- Build contrast vectors and compute local H matrices --- #
        local_states, shared_states, round_idx = local_step(
            local_method=self.compute_contrast_vector_and_H_matrix,
            train_data_nodes=train_data_nodes,
            output_local_states=local_states,
            round_idx=round_idx,
            input_local_states=local_states,
            input_shared_state=None,  # TODO plug in previous step
            aggregation_id=aggregation_node.organization_id,
            description="Build contrast vectors and compute local H matrices",
            clean_models=clean_models,
        )

        # --- Aggregate the H matrices and run the Wald tests --- #
        wald_shared_state, round_idx = aggregation_step(
            aggregation_method=self.agg_run_wald_tests,
            train_data_nodes=train_data_nodes,
            aggregation_node=aggregation_node,
            input_shared_states=shared_states,
            round_idx=round_idx,
            description="Run Wald tests.",
            clean_models=clean_models,
        )

        return local_states, wald_shared_state, round_idx
run_wald_tests(train_data_nodes, aggregation_node, local_states, round_idx, clean_models)

Compute the Wald tests.

Parameters:

Name Type Description Default
train_data_nodes

List of TrainDataNode.

required
aggregation_node

The aggregation node.

required
local_states

Local states. Required to propagate intermediate results.

required
round_idx

Index of the current round.

required
clean_models

Whether to clean the models after the computation.

required
Source code in fedpydeseq2/core/deseq2_core/deseq2_stats/wald_tests/wald_tests.py
def run_wald_tests(
    self,
    train_data_nodes,
    aggregation_node,
    local_states,
    round_idx,
    clean_models,
):
    """Compute the Wald tests.

    Parameters
    ----------
    train_data_nodes: list
        List of TrainDataNode.

    aggregation_node: AggregationNode
        The aggregation node.

    local_states: dict
        Local states. Required to propagate intermediate results.

    round_idx: int
        Index of the current round.

    clean_models: bool
        Whether to clean the models after the computation.
    """
    # --- Build contrast vectors and compute local H matrices --- #
    local_states, shared_states, round_idx = local_step(
        local_method=self.compute_contrast_vector_and_H_matrix,
        train_data_nodes=train_data_nodes,
        output_local_states=local_states,
        round_idx=round_idx,
        input_local_states=local_states,
        input_shared_state=None,  # TODO plug in previous step
        aggregation_id=aggregation_node.organization_id,
        description="Build contrast vectors and compute local H matrices",
        clean_models=clean_models,
    )

    # --- Aggregate the H matrices and run the Wald tests --- #
    wald_shared_state, round_idx = aggregation_step(
        aggregation_method=self.agg_run_wald_tests,
        train_data_nodes=train_data_nodes,
        aggregation_node=aggregation_node,
        input_shared_states=shared_states,
        round_idx=round_idx,
        description="Run Wald tests.",
        clean_models=clean_models,
    )

    return local_states, wald_shared_state, round_idx