0%

SecretFlow PSI tests

SecretFlow隐语PSI实验环境配置与测试

配置SecretFlow 并测试两方PSI的效率

官方文档 隐语PSI Benchmark白皮书 — SecretFlow 文档

配置环境

配置conda

1
2
3
4
5
6
7
8
9
10
# Install wget, then download and run the Miniconda installer.
sudo apt-get install wget
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
# The installer is interactive. The lines below are the answers typed into
# the installer, NOT shell commands:
#   install location:  ~/.miniconda3
#   Do you wish the installer to initialize Miniconda3 by running conda init? [yes|no]
#   [no] >>> yes
# Reload the shell configuration written by `conda init`.
source ~/.bashrc

# Verify the installation.
conda --version

新建conda环境

1
2
3
4
5
# Create and activate a dedicated Python 3.8 environment for the benchmark.
conda create -n sf-benchmark python=3.8
conda activate sf-benchmark
# Install (or upgrade to) the latest SecretFlow release in that environment.
pip install -U secretflow
# Working directory for the scripts and the generated data; -p keeps the
# command from failing when the directory already exists.
mkdir -p sf-benchmark
cd sf-benchmark

生成 .csv 集合数据的脚本,调用方式如下(参数为集合大小):

1
python3 gene_psi.py 1000000

脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# python3 gene_psi.py
from random import randint
from random import sample
import csv
import sys


def random_with_N_digits(n):
    """Return a random integer that has exactly ``n`` decimal digits.

    Args:
        n: Number of decimal digits; must be at least 1.

    Returns:
        An ``int`` drawn with ``randint`` from the closed range
        ``[10**(n-1), 10**n - 1]``, so the leading digit is never zero.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    # 10 ** (n - 1) turns into a float for n < 1, which randint cannot use.
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    range_start = 10 ** (n - 1)
    range_end = (10**n) - 1
    return randint(range_start, range_end)


# Set sizes:
#   len1 - number of ids written to psi_1.csv and to psi_2.csv
#   len2 - number of ids psi_2.csv shares with psi_1.csv
#   len3 - number of ids written to psi_3.csv
#   len4 - number of ids psi_3.csv shares with psi_1.csv
row_list = []
len1 = 10**2
len2 = 10
len3 = 10
len4 = 10

# argv[1] (optional): size of sets 1 and 2; half of set 2 then overlaps set 1.
if len(sys.argv) > 1:
    len1 = int(sys.argv[1])
    len2 = len1 // 2

# argv[2] (optional): size of set 3.
if len(sys.argv) > 2:
    len3 = int(sys.argv[2])

# Half of set 3 overlaps set 1, whether or not argv[2] was given.
len4 = len3 // 2
print(len1, len2)

# The shared ids are sampled from set 1, so set 1 must be large enough.
# Fail before the (potentially long) generation instead of inside sample().
if len4 > len1:
    raise ValueError(
        f"set 3 needs {len4} ids shared with set 1, but set 1 only has {len1}"
    )

# Ids are 38-digit decimal numbers: 10**38 is about 2**126, so 38 is the
# largest digit count whose values always fit into 128 bits.
# Every row is a one-element list because csv.writer expects rows.
row_list = [[random_with_N_digits(38)] for _ in range(len1)]

# Set 2: len2 rows taken from set 1, padded with fresh ids up to len1 rows.
row_list2 = sample(row_list, len2)
row_list2.extend([random_with_N_digits(38)] for _ in range(len1 - len2))

# Set 3: len4 rows taken from set 1, padded with fresh ids up to len3 rows.
row_list3 = sample(row_list, len4)
row_list3.extend([random_with_N_digits(38)] for _ in range(len3 - len4))

print(len(row_list2))
print(len(row_list3))

# Each output file has a single "id" column.
for path, rows in (
    ('psi_1.csv', row_list),
    ('psi_2.csv', row_list2),
    ('psi_3.csv', row_list3),
):
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["id"])
        writer.writerows(rows)

测试安装是否成功

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# python3 foot.py
# Smoke test for the SecretFlow installation: run a two-party PSI on the iris
# data set in local mode and compare the output files with a plain pandas
# inner join of the same inputs.
import os

import numpy as np
import pandas as pd
import secretflow as sf
from sklearn.datasets import load_iris

sf.init(['alice', 'bob', 'carol'], address='local')
dev = sf.PYU('alice')

data, target = load_iris(return_X_y=True, as_frame=True)
data['uid'] = np.arange(len(data)).astype('str')
# Label the first half of the rows 'Jan' and the remaining rows 'Feb'.
# The repeat count used to be written as `2^20`, but `^` is bitwise XOR in
# Python (2 ^ 20 == 22), so the list did not contain one entry per row and
# the column assignment failed. Derive the counts from the frame instead.
half = len(data) // 2
data['month'] = ['Jan'] * half + ['Feb'] * (len(data) - half)

# Write each party's randomly sampled input file under ./data.
os.makedirs('./data', exist_ok=True)

da, db, dc = data.sample(frac=0.9), data.sample(frac=0.8), data.sample(frac=0.7)

da.to_csv('data/alice.csv', index=False)
db.to_csv('data/bob.csv', index=False)
dc.to_csv('data/carol.csv', index=False)

alice, bob = sf.PYU('alice'), sf.PYU('bob')
spu = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))

# Intersect alice's and bob's files on the 'uid' column.
input_path = {alice: 'data/alice.csv', bob: 'data/bob.csv'}
output_path = {alice: 'data/alice_psi.csv', bob: 'data/bob_psi.csv'}
spu.psi_csv('uid', input_path, output_path, 'alice')

# Reference result: inner join of the two inputs on 'uid', restricted to
# alice's columns. 'uid' is cast to int64 because read_csv parses it as an
# integer column when the PSI output is loaded back.
df = da.join(db.set_index('uid'), on='uid', how='inner', rsuffix='_bob', sort=True)
expected = df[da.columns].astype({'uid': 'int64'}).reset_index(drop=True)

da_psi = pd.read_csv('data/alice_psi.csv')
db_psi = pd.read_csv('data/bob_psi.csv')

# Both output files must equal the reference join.
pd.testing.assert_frame_equal(da_psi, expected)
pd.testing.assert_frame_equal(db_psi, expected)

print(da_psi)

实际实验测试脚本

启动节点

1
2
3
# Export the variable so that the `ray start` commands below inherit it; a
# bare `VAR=value` on a line of its own only sets an unexported shell variable.
export RAY_DISABLE_REMOTE_CODE=true
# Head node, advertising the custom resource "alice".
ray start --head --node-ip-address="192.168.31.128" --port="9394" --resources='{"alice": 2}' --include-dashboard=False
# Second node joining the head above, advertising the custom resource "bob".
ray start --address="192.168.31.128:9394" --resources='{"bob": 2}'

通过更换 spu.psi_csv 的 protocol 参数,分别测试 3 个两方 PSI 协议(ECDH、KKRT、BC22),脚本如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# python3 benchmark.py
import sys
import time
import logging

from absl import app
import spu
import secretflow as sf

# Send log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# SPU cluster description: one entry per party plus the runtime settings.
# For every node, `address` and `listen_address` should use the same port.
cluster_def = {
    'nodes': [
        # <<< !!! >>> replace the ip with alice node's local ip and pick a
        # free port (192.168.31.128 is the author's own machine).
        {
            'party': 'alice',
            'id': 'local:0',
            'address': '192.168.31.128:12345',
            'listen_address': '0.0.0.0:12345',
        },
        # <<< !!! >>> replace the ip with bob node's local ip and pick a
        # free port.
        {
            'party': 'bob',
            'id': 'local:1',
            'address': '192.168.31.128:12333',
            'listen_address': '0.0.0.0:12333',
        },
        # <<< !!! >>> for a 3pc test add another node here, for example
        # carol as rank 2:
        # {'party': 'carol', 'id': 'local:2', 'address': '127.0.0.1:12347'},
    ],
    'runtime_config': {
        'protocol': spu.spu_pb2.SEMI2K,
        'field': spu.spu_pb2.FM128,
    },
}

def main(_):
    """Run one PSI over psi_1.csv / psi_2.csv and log the elapsed time.

    Args:
        _: Positional arguments forwarded by ``absl.app.run``; unused.
    """
    # <<< !!! >>> replace the address with your own ray head <ip:port>.
    # With two parties, secretflow>=0.7.18b3 needs `parties=[...]` in sf.init.
    sf.init(parties=['alice', 'bob'], address='192.168.31.128:9394', log_to_driver=True)
    try:
        alice = sf.PYU('alice')
        bob = sf.PYU('bob')
        # carol = sf.PYU('carol')

        # <<< !!! >>> replace the paths with each party's real local file path.
        input_path = {
            alice: './psi_1.csv',
            bob: './psi_2.csv',
            # if run with `ECDH_PSI_3PC`, add carol
            # carol: '/data/psi_3.csv',
        }
        # NOTE(review): both parties use the same output path. That is fine
        # here because only the receiver gets an output file
        # (broadcast_result=False); presumably the paths must differ if both
        # parties write on the same host - confirm before enabling broadcast.
        output_path = {
            alice: './psi_output.csv',
            bob: './psi_output.csv',
            # if run with `ECDH_PSI_3PC`, add carol
            # carol: '/data/psi_output.csv',
        }
        select_keys = {
            alice: ['id'],
            bob: ['id'],
            # if run with `ECDH_PSI_3PC`, add carol
            # carol: ['id'],
        }
        # Named spu_device so that it does not shadow the imported `spu` module.
        spu_device = sf.SPU(cluster_def)
        # spu_device = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))

        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments during a long run.
        start = time.perf_counter()

        reports = spu_device.psi_csv(
            key=select_keys,
            input_path=input_path,
            output_path=output_path,
            receiver='alice',  # if `broadcast_result=False`, only receiver can get output file.
            # all the possible protocols: ECDH_PSI_2PC KKRT_PSI_2PC BC22_PSI_2PC ECDH_PSI_3PC
            protocol='ECDH_PSI_2PC',  # psi protocol
            precheck_input=False,  # will cost ext time if set True
            sort=False,  # will cost ext time if set True
            broadcast_result=False,  # will cost ext time if set True
        )
        print(f"psi reports: {reports}")
        logging.info("cost time: %s", time.perf_counter() - start)  # units: second
    finally:
        # Shut secretflow down even when the PSI run raises.
        sf.shutdown()


if __name__ == '__main__':
    app.run(main)

SKY-PSI测试

./bin/PSI_test -r 0 -ss 20 -rs 20 -w 621 -h 20 -hash 10 & ./bin/PSI_test -r 1 -ss 20 -rs 20 -w 621 -h 20 -hash 10

./bin/PSI_test -r 0 -ss 22 -rs 22 -w 627 -h 22 -hash 10 & ./bin/PSI_test -r 1 -ss 22 -rs 22 -w 627 -h 22 -hash 10

./bin/PSI_test -r 0 -ss 23 -rs 23 -w 630 -h 23 -hash 10 & ./bin/PSI_test -r 1 -ss 23 -rs 23 -w 630 -h 23 -hash 10

./bin/PSI_test -r 0 -ss 24 -rs 24 -w 633 -h 24 -hash 11 & ./bin/PSI_test -r 1 -ss 24 -rs 24 -w 633 -h 24 -hash 11

./bin/PSI_test -r 0 -ss 25 -rs 25 -w 636 -h 25 -hash 11 & ./bin/PSI_test -r 1 -ss 25 -rs 25 -w 636 -h 25 -hash 11

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
alleysira@ubuntu:~/SKY-PSI$ cmake .
alleysira@ubuntu:~/SKY-PSI$ make
alleysira@ubuntu:~/SKY-PSI$ ./bin/PSI_test -r 0 -ss 20 -rs 20 -w 621 -h 20 -hash 10 -ip 127.0.0.1 -ck 0 -gm 0 & ./bin/PSI_test -r 1 -ss 20 -rs 20 -w 621 -h 20 -hash 10 -ip 127.0.0.1 -ck 0 -gm 0
alleysira@ubuntu:~/SKY-PSI$ ./bin/PSI_test -r 0 -ss 20 -rs 20 -w 621 -h 20 -hash 10 -ip 127.0.0.1 -ck 0 -gm 0 & ./bin/PSI_test -r 1 -ss 20 -rs 20 -w 621 -h 20 -hash 10 -ip 127.0.0.1 -ck 0 -gm 0
[1] 4988
Receiver matrix sent and transposed hash input computed
Label Time (ms) diff (ms)
__________________________________
Sender base OT finished 155.1 155.071 ******
Sender set transformed 645.4 490.352 *******
Sender transposed hash input computed 6863.1 6217.686 **********
Sender hash outputs computed and sent 8659.7 1796.587 *********
Receiver intersection computed,Intersection size:500000
Label Time (ms) diff (ms)
__________________________________
Receiver base OT finished 135.9 135.929 ******
Receiver initialized 216.6 80.709 *****
Receiver set transformed 620.4 403.792 *******
Receiver matrix sent and transposed hash input computed 6829.3 6208.850 **********
Receiver intersection computed 9250.8 2421.508 *********
Receiver sent communication: 77.631 MB
Receiver received communication: 4.813 MB
Receiver total communication: 82.445 MB
[1]+ Done ./bin/PSI_test -r 0 -ss 20 -rs 20 -w 621 -h 20 -hash 10 -ip 127.0.0.1 -ck 0 -gm 0

cm20 支持到 $2^{22}$、$2^{23}$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
alleysira@ubuntu:~/OPRF-PSI-backup$ ./bin/PSI_test -r 0 -ss 22 -rs 22 -w 627 -h 22 -hash 11  & ./bin/PSI_test -r 1 -ss 22 -rs 22 -w 627 -h 22 -hash 11 -ip 127.0.0.1
[2] 5249
Receiver matrix sent and transposed hash input computed
Label Time (ms) diff (ms)
__________________________________
Sender base OT finished 688.9 688.936 ******
Sender set transformed 2053.2 1364.259 *******
Sender transposed hash input computed 28745.7 26692.532 **********
Sender hash outputs computed and sent 38459.4 9713.711 *********
Begin
Receiver intersection computed,Intersection size:532 Proportion of intersection:0.382% correct!
Label Time (ms) diff (ms)
__________________________________
Receiver base OT finished 690.1 690.110 ******
Receiver initialized 874.8 184.698 *****
Receiver set transformed 2005.0 1130.157 *******
Receiver matrix sent and transposed hash input computed 28747.9 26742.969 **********
Receiver intersection computed 41609.8 12861.827 *********
Receiver sent communication: 313.506 MB
Receiver received communication: 10.016 MB
Receiver total communication: 323.522 MB

alleysira@ubuntu:~/OPRF-PSI-backup$ ./bin/PSI_test -r 0 -ss 23 -rs 23 -w 627 -h 23 -hash 11 & ./bin/PSI_test -r 1 -ss 23 -rs 23 -w 627 -h 23 -hash 11 -ip 127.0.0.1
[1] 5273
Receiver matrix sent and transposed hash input computed
Label Time (ms) diff (ms)
__________________________________
Sender base OT finished 150.6 150.579 *****
Sender set transformed 2884.4 2733.777 *******
Sender transposed hash input computed 67371.3 64486.946 **********
Sender hash outputs computed and sent 90574.5 23203.154 *********
Begin
Receiver intersection computed,Intersection size:536 Proportion of intersection:0.191% correct!
Label Time (ms) diff (ms)
__________________________________
Receiver base OT finished 148.4 148.435 *****
Receiver initialized 517.2 368.812 *****
Receiver set transformed 2835.7 2318.477 *******
Receiver matrix sent and transposed hash input computed 67371.8 64536.073 **********
Receiver intersection computed 97432.1 30060.339 *********
Receiver sent communication: 627.006 MB
Receiver received communication: 20.016 MB
Receiver total communication: 647.022 MB

实验环境

4Core/12GB

hard drive: 30GB,实验过程中硬盘空间不足,可能影响效率

cm20的元素大小为$2^{128}$,隐语默认为$10^{18}$(已修改为$10^{38}$)

cm20 默认设置的交集个数为 100(已修正为 500000),隐语的交集个数设置为集合大小的一半

分别限制网络带宽为30Mbps 20ms、100Mbps 20ms和LAN进行测试,对secretflow的测试需要限制环回地址lo

1
2
3
4
5
6
7
8
9
10
ifconfig 

# Emulate a 100 Mbps link with 20 ms delay on the loopback interface (lo).
# Root qdisc: token bucket filter limiting the rate to 100 Mbit/s.
sudo tc qdisc add dev lo root handle 1: tbf rate 100mbit burst 256kb latency 800ms
# Child qdisc attached below the tbf: netem adding a 20 ms delay.
sudo tc qdisc add dev lo parent 1:1 handle 10: netem delay 20msec limit 8000

# Remove the limits again by deleting the root qdisc on lo.
sudo tc qdisc del dev lo root
# Show the qdiscs currently configured on lo.
sudo tc qdisc show dev lo

30Mbps 20ms结果

实验结果

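| 数量级 | KKRT/s | ECDH/s | BC22/s | SKY-PSI/s |
| --- | --- | --- | --- | --- |
| 百万 1000000 $2^{20}$ | 40.00+40.81=40.405 | 159.15+161.22=160.185 | 58.23+60.14=59.18 | 26.47+26.76+26.81=26.68 |
| 千万 10000000 | 369.75 | 1558.04 | 537.13 | — |

注:“a+b=c”表示多次运行的耗时(秒),等号右侧为其平均值。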

100Mbps 20ms结果

1
2
3
4
5
(base) jie@jie-virtual-machine:~$ sudo  tc qdisc add dev lo root handle 1: tbf rate 100mbit burst 256kb latency 800ms 
(base) jie@jie-virtual-machine:~$ sudo tc qdisc add dev lo parent 1:1 handle 10: netem delay 20msec limit 8000
(base) jie@jie-virtual-machine:~$ sudo tc qdisc show dev lo
qdisc tbf 1: root refcnt 2 rate 100Mbit burst 256Kb lat 800ms
qdisc netem 10: parent 1:1 limit 8000 delay 20ms
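
| 数量级 | KKRT/s | ECDH/s | BC22/s | SKY-PSI/s |
| --- | --- | --- | --- | --- |
| 百万 1000000 $2^{20}$ | 18.80+18.29=18.545 | 151.927 | 16.86+16.176=16.518 | 10.52+10.51+10.57=10.53 |
| 千万 10000000 | 138.21+139.47=138.84 | 1544.09 | 106.89+111.60=109.245 | — |

注:“a+b=c”表示多次运行的耗时(秒),等号右侧为其平均值。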

LAN实验结果

| 数量级 | KKRT/s | ECDH/s | BC22/s | SKY-PSI/s |
| --- | --- | --- | --- | --- |
| 百万 1000000 $2^{20}$ | 12.85+10.65+12.63+10.19+13.11+9.57+9.82+9.68+9.93=9.84 | 137.29+142.37+141.59+147.97+143.38=142.52 | 10.33+10.318=10.324 | 9.2508+9.002+10.941+9.0912+9.036+9.663+10.935+10.154=9.75 |
| 千万 10000000 | 76.24+72.58+74.83+70.75+70.83=73.046 | 1374.78 | 79.94+78.78+81.13+78.40+78.62=79.374 | — |

注:“a+b=c”表示多次运行的耗时(秒),等号右侧为采用的平均值。其中 KKRT 百万一栏所列 9 次结果的算术平均约为 10.94,与 9.84 不一致,请核对该项。

蚂蚁给出的结果

img