## Sample-and-Aggregate algorithm using our DP Mean and Gaussian Mechanism


In [None]:
import numpy as np
import pandas as pd

# established in previous lectures
from mock_dp_library import *

Read in the data. We're going to use the PUMS dataset we are already familiar with, and focus on the education variable, which is coded on a 16-point scale.

In [None]:
import pandas as pd
# Load the Fulton PUMS extract directly from the course repository (needs network access).
data = pd.read_csv(
    "https://raw.githubusercontent.com/opendp/cs208/main/spring2022/data/FultonPUMS5full.csv")

# define public information
n = len(data)            # in this case, dataset length is considered public, and is not protected
educ_bounds = (1., 16.)  # easily guessable without looking at the data

# Extract the education column as a float array for the release below.
educ = data['educ'].values.astype(float)
# Sanity check: release a DP mean of education with epsilon = 1.
# NOTE(review): release_dp_mean comes from mock_dp_library; its default mechanism and
# exactly how it applies `bounds` are not visible here — confirm against that module.
print(release_dp_mean(educ, bounds=educ_bounds, epsilon=1.))

10.610644633036072


In [None]:
def sample_aggregate(data: pd.DataFrame, function, partition_count: int, bounds, epsilon, delta):
    """Release a private estimate of `function` using sample-and-aggregate.

    The rows of `data` are shuffled and dealt into `partition_count` disjoint
    partitions, `function` is evaluated on every partition independently, and
    the per-partition results are combined by `release_dp_mean` using the
    Gaussian mechanism.

    Args:
        data: dataset to analyze; every row lands in exactly one partition.
        function: callable that takes one partition of `data` and returns a
            single estimate for it.
        partition_count: number of disjoint partitions to create (>= 1).
        bounds: pair forwarded to `release_dp_mean` as the bounds on the
            per-partition estimates.
        epsilon: privacy parameter forwarded to `release_dp_mean`.
        delta: privacy parameter forwarded to `release_dp_mean`.

    Returns:
        The value returned by `release_dp_mean` for the list of
        per-partition results.

    Raises:
        ValueError: if `partition_count` is smaller than 1.
    """
    if partition_count < 1:
        raise ValueError("partition_count must be a positive integer")

    ## SAMPLE
    # shuffle without replacement
    shuffled = data.sample(frac=1, replace=False)
    # Split row *positions* and select with .iloc instead of handing the frame
    # itself to np.array_split: the latter goes through DataFrame.swapaxes,
    # which is deprecated in recent pandas releases. Partition sizes are the
    # same as np.array_split would produce (they differ by at most one row).
    position_chunks = np.array_split(np.arange(len(shuffled)), partition_count)
    partitions = [shuffled.iloc[chunk] for chunk in position_chunks]

    ## EVALUATE
    results = [function(partition) for partition in partitions]

    ## AGGREGATE
    return release_dp_mean(
        results, bounds=bounds, epsilon=epsilon, delta=delta, mechanism="gaussian")

In [None]:
def correlation(data):
    """Return the correlation coefficient between the 'educ' and 'income' columns.

    `data` must support lookup by the keys 'educ' and 'income'. The value is
    the off-diagonal entry of the 2x2 matrix computed by `np.corrcoef`.
    """
    educ_values = data['educ']
    income_values = data['income']
    corr_matrix = np.corrcoef(educ_values, income_values)
    # Row 0 / column 1 holds the cross-correlation of the two inputs.
    return corr_matrix[0, 1]


# Privately estimate the educ/income correlation: `correlation` is evaluated on each of
# 200 partitions of `data`, and the per-partition values are aggregated by sample_aggregate.
# NOTE(review): bounds=[0,1] is narrower than the full Pearson range [-1, 1]; presumably
# the bounds are used to clamp per-partition estimates — confirm that restricting to
# non-negative correlations is an intentional, data-independent assumption.
dp_correlation = sample_aggregate(
    data, correlation, partition_count=200, bounds=[0,1], epsilon=1, delta=1e-6)

# The first value is computed directly on the full data (no privacy mechanism) and is
# shown only as a reference point for the DP release.
print("True correlation:", correlation(data))
print("DP   correlation:", dp_correlation)

True correlation: 0.35472882626591723
DP   correlation: 0.370176414211767
