Skip to content

Commit 055bb89

Browse files
mckornfield authored and HenriqueTolentino committed
add collab links pointing to main and 101 notebook (#587)
1 parent b225eac commit 055bb89

File tree

5 files changed

+989
-0
lines changed

5 files changed

+989
-0
lines changed
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "d97df5a0",
6+
"metadata": {},
7+
"source": [
8+
"<a target=\"_parent\" href=\"https://colab.research.google.com/github/gretelai/gretel-blueprints/blob/main/docs/notebooks/safe-synthetics/free-text-transform-synthesize-dp.ipynb\">\n",
9+
" <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
10+
"</a>"
11+
]
12+
},
13+
{
14+
"cell_type": "code",
15+
"execution_count": null,
16+
"metadata": {
17+
"id": "ubmyh3IVoL7w"
18+
},
19+
"outputs": [],
20+
"source": [
21+
"%%capture\n",
22+
"\n",
23+
"%pip install git+https://github.com/gretelai/gretel-python-client.git@main"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"metadata": {
30+
"id": "JF2cRncBoT1P"
31+
},
32+
"outputs": [],
33+
"source": [
34+
"from gretel_client.navigator_client import Gretel\n",
35+
"from rich.console import Console\n",
36+
"\n",
37+
"gretel = Gretel(api_key=\"prompt\", endpoint=\"https://api.dev.gretel.ai\")\n",
38+
"console = Console()"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": null,
44+
"metadata": {
45+
"id": "4KULZsmkowgk"
46+
},
47+
"outputs": [],
48+
"source": [
49+
"from google.colab import drive\n",
50+
"drive.mount('/content/drive')\n",
51+
"\n",
52+
"#ds = \"/content/drive/My Drive/credit_card_transaction_1k.csv\"\n",
53+
"#ds = \"/content/drive/My Drive/hipaa_patients.csv\"\n",
54+
"ds = \"/content/drive/My Drive/ecommerce_customers.csv\"\n",
55+
"\n",
56+
"import pandas as pd\n",
57+
"#ds = \"https://raw.githubusercontent.com/gretelai/gretel-blueprints/main/sample_data/sample-patient-events.csv\"\n",
58+
"df = pd.read_csv(ds)\n",
59+
"\n",
60+
"print(f\"Number of rows: {len(df)}\")\n",
61+
"df.head()"
62+
]
63+
},
64+
{
65+
"cell_type": "code",
66+
"execution_count": null,
67+
"metadata": {
68+
"id": "NHD2Ny15o5ZF"
69+
},
70+
"outputs": [],
71+
"source": [
72+
"gdpr_safe_config_yaml = \"\"\"\n",
73+
"globals:\n",
74+
" classify:\n",
75+
" enable: true\n",
76+
" entities:\n",
77+
" # True identifiers (also part of HIPAA)\n",
78+
" - first_name\n",
79+
" - last_name\n",
80+
" - name\n",
81+
" - street_address\n",
82+
" - city\n",
83+
" - state\n",
84+
" - postcode\n",
85+
" - country\n",
86+
" - address\n",
87+
" - latitude\n",
88+
" - longitude\n",
89+
" - coordinate\n",
90+
" - age\n",
91+
" - phone_number\n",
92+
" - fax_number\n",
93+
" - email\n",
94+
" - ssn\n",
95+
" - unique_identifier\n",
96+
" - medical_record_number\n",
97+
" - health_plan_beneficiary_number\n",
98+
" - account_number\n",
99+
" - certificate_license_number\n",
100+
" - vehicle_identifier\n",
101+
" - license_plate\n",
102+
" - device_identifier\n",
103+
" - biometric_identifier\n",
104+
" - url\n",
105+
" - ipv4\n",
106+
" - ipv6\n",
107+
"\n",
108+
" # True identifiers (in addition to HIPAA)\n",
109+
" - national_id\n",
110+
" - tax_id\n",
111+
" - bank_routing_number\n",
112+
" - swift_bic\n",
113+
" - credit_debit_card\n",
114+
" - cvv\n",
115+
" - pin\n",
116+
" - employee_id\n",
117+
" - api_key\n",
118+
" - coordinate\n",
119+
" - customer_id\n",
120+
" - user_name\n",
121+
" - password\n",
122+
" - mac_address\n",
123+
" - http_cookie\n",
124+
"\n",
125+
" # Quasi identifiers (also part of HIPAA)\n",
126+
" - date\n",
127+
" - date_time\n",
128+
"\n",
129+
" # Quasi identifiers (in addition to HIPAA)\n",
130+
" - blood_type\n",
131+
" - gender\n",
132+
" - sexuality\n",
133+
" - political_view\n",
134+
" - race\n",
135+
" - ethnicity\n",
136+
" - religious_belief\n",
137+
" - language\n",
138+
" - education\n",
139+
" - job_title\n",
140+
" - employment_status\n",
141+
" - company_name\n",
142+
" ner:\n",
143+
" ner_threshold: 0.7\n",
144+
" locales: [en_US]\n",
145+
"steps:\n",
146+
" - vars:\n",
147+
" row_seed: random.random()\n",
148+
" rows:\n",
149+
" update:\n",
150+
" - condition: column.entity == \"first_name\" and not (this | isna)\n",
151+
" value: fake.persona(row_index=vars.row_seed + index).first_name\n",
152+
" - condition: column.entity == \"last_name\" and not (this | isna)\n",
153+
" value: fake.persona(row_index=vars.row_seed + index).last_name\n",
154+
" - condition: column.entity == \"name\" and not (this | isna)\n",
155+
" value: column.entity | fake\n",
156+
" - condition: (column.entity == \"street_address\" or column.entity == \"city\" or column.entity == \"state\" or column.entity == \"postcode\" or column.entity == \"address\") and not (this | isna)\n",
157+
" value: column.entity | fake\n",
158+
" - condition: column.entity == \"latitude\" and not (this | isna)\n",
159+
" value: fake.location_on_land()[0]\n",
160+
" - condition: column.entity == \"longitude\" and not (this | isna)\n",
161+
" value: fake.location_on_land()[1]\n",
162+
" - condition: column.entity == \"coordinate\" and not (this | isna)\n",
163+
" value: fake.location_on_land()\n",
164+
" - condition: column.entity == \"email\" and not (this | isna)\n",
165+
" value: fake.persona(row_index=vars.row_seed + index).email\n",
166+
" - condition: column.entity == \"ssn\" and not (this | isna)\n",
167+
" value: column.entity | fake\n",
168+
" - condition: column.entity == \"phone_number\" and not (this | isna)\n",
169+
" value: (fake.random_number(digits=3) | string) + \"-\" + (fake.random_number(digits=3) | string) + \"-\" + (fake.random_number(digits=4) | string)\n",
170+
" - condition: column.entity == \"fax_number\" and not (this | isna)\n",
171+
" value: (fake.random_number(digits=3) | string) + \"-\" + (fake.random_number(digits=3) |\n",
172+
" string) + \"-\" + (fake.random_number(digits=4) | string)\n",
173+
" - condition: column.entity == \"vehicle_identifier\" and not (this | isna)\n",
174+
" value: fake.vin()\n",
175+
" - condition: column.entity == \"license_plate\" and not (this | isna)\n",
176+
" value: column.entity | fake\n",
177+
" - condition: (column.entity == \"unique_identifier\" or column.entity == \"medical_record_number\" or column.entity == \"health_plan_beneficiary_number\" or column.entity == \"account_number\" or column.entity == \"certificate_license_number\" or column.entity == \"device_identifier\" or column.entity == \"biometric_identifier\" or column.entity == \"bank_routing_number\" or column.entity == \"swift_bic\" or column.entity == \"employee_id\" or column.entity == \"api_key\" or column.entity == \"customer_id\" or column.entity == \"user_name\" or column.entity == \"password\" or column.entity == \"http_cookie\") and not (this | isna)\n",
178+
" value: fake.bothify(re.sub(\"\\\\d\", \"#\", re.sub(\"[A-Z]\", \"?\", (this | string))))\n",
179+
" - condition: (column.entity == \"url\" or column.entity == \"ipv4\" or column.entity == \"ipv6\") and not (this | isna)\n",
180+
" value: column.entity | fake\n",
181+
" - condition: (column.entity == \"national_id\" or column.entity == \"tax_id\") and not (this | isna)\n",
182+
" value: fake.itin()\n",
183+
" - condition: column.entity == \"credit_debit_card\" and not (this | isna)\n",
184+
" value: fake.credit_card_number()\n",
185+
" - condition: column.entity == \"cvv\" and not (this | isna)\n",
186+
" value: fake.credit_card_security_code()\n",
187+
" - condition: column.entity == \"pin\" and not (this | isna)\n",
188+
" value: fake.random_number(digits=4) | string\n",
189+
" - condition: column.entity == \"coordinate\" and not (this | isna)\n",
190+
" value: column.entity | fake\n",
191+
" - condition: column.entity == \"mac_address\" and not (this | isna)\n",
192+
" value: column.entity | fake\n",
193+
"\n",
194+
" - condition: column.entity is none and column.type == \"text\"\n",
195+
" value: this | fake_entities\n",
196+
"\"\"\"\n"
197+
]
198+
},
199+
{
200+
"cell_type": "code",
201+
"execution_count": null,
202+
"metadata": {
203+
"id": "5jQAciloopLn"
204+
},
205+
"outputs": [],
206+
"source": [
207+
"tabular_ft_config = {\n",
208+
" \"train\": {\n",
209+
" \"params\": {\n",
210+
" \"num_input_records_to_sample\": 5000\n",
211+
" },\n",
212+
" \"privacy_params\": {\n",
213+
" \"dp\": False\n",
214+
" }\n",
215+
" }\n",
216+
"}\n",
217+
"\n",
218+
"\n",
219+
"import yaml\n",
220+
"\n",
221+
"synthetic_dataset = gretel.safe_synthetic_dataset\\\n",
222+
" .from_data_source(df) \\\n",
223+
" .transform(yaml.safe_load(gdpr_safe_config_yaml)) \\\n",
224+
" .synthesize(\"tabular_ft\", tabular_ft_config, num_records=1000) \\\n",
225+
" .create()"
226+
]
227+
},
228+
{
229+
"cell_type": "code",
230+
"execution_count": null,
231+
"metadata": {
232+
"id": "GDOmyMKVSSrU"
233+
},
234+
"outputs": [],
235+
"source": [
236+
"synthetic_dataset.dataset.df.head()"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": null,
242+
"metadata": {
243+
"id": "TvXGWJpLSTWJ"
244+
},
245+
"outputs": [],
246+
"source": [
247+
"synthetic_dataset.report.table"
248+
]
249+
},
250+
{
251+
"cell_type": "code",
252+
"execution_count": null,
253+
"metadata": {
254+
"id": "8Ue-7rS4DCEt"
255+
},
256+
"outputs": [],
257+
"source": [
258+
"import IPython\n",
259+
"IPython.display.HTML(str(synthetic_dataset.download_report(format=\"html\").read().decode('utf-8')), metadata=dict(isolated=True))"
260+
]
261+
}
262+
],
263+
"metadata": {
264+
"colab": {
265+
"provenance": []
266+
},
267+
"kernelspec": {
268+
"display_name": "Python 3",
269+
"name": "python3"
270+
},
271+
"language_info": {
272+
"name": "python"
273+
}
274+
},
275+
"nbformat": 4,
276+
"nbformat_minor": 0
277+
}

0 commit comments

Comments
 (0)