Commit 9777b17

Merge pull request #28 from redhuntlabs/saudi-pii-pack
Saudi pii pack
2 parents 9b266f5 + 0ec2088

File tree

4 files changed (+65, -3 lines)


definitions.json

Lines changed: 54 additions & 0 deletions
@@ -151,6 +151,60 @@
     "<<<<"
   ]
 },
+"Resident Identity (Iqama)": {
+  "regex":null,
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Kingdom",
+    "Saudi",
+    "Arabia",
+    "Ministry",
+    "Interior",
+    "Permit",
+    "Iqama",
+    "Residen",
+    "Identity"
+  ]
+},
+"Saudi Driver's License": {
+  "regex":"\b[0-9]{10}\b",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Kingdom",
+    "Saudi",
+    "Arabia",
+    "Ministry",
+    "Interior",
+    "Driving",
+    "License"
+  ]
+},
+"Saudi Arabian Visa": {
+  "regex":"(?:V<SAU)(?:[A-Z0-9<].+)",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Visa",
+    "Saudi Arabia",
+    "V<SAU",
+    "<<<<",
+    "Entries",
+    "Permitted",
+    "Work",
+    "Validity"
+  ]
+},
+"Tawuniya Health Insurance": {
+  "regex":"\b[0-9]{5}\b",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Tawuniya",
+    "Policy",
+    "Holder",
+    "Number",
+    "Deductible",
+    "Approval"
+  ]
+},
 "Nebraska Driver's License": {
   "regex":"[A-Z]{1}[0-9]{9,11}",
   "region":"United States",

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
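The hunks shown here only add the dependency and don't reveal where spacy is imported; presumably it backs named-entity recognition alongside NLTK. A minimal sketch of that assumed usage (the en_core_web_sm model must be downloaded separately):

import spacy  # assumed NER use; the diff does not show the import site

# Requires: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp("Issued by the Ministry of Interior, Riyadh, Saudi Arabia.")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. GPE/ORG entities as region hints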

text_utils.py

Lines changed: 10 additions & 3 deletions
@@ -94,9 +94,16 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
+    try:
+        nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
+        for resource in nltk_resources:
+            if not nltk.data.find(resource): raise LookupError()
+    except LookupError:
+        for resource in resources:
+            nltk.download(resource)
+
     stop_words = set(stopwords.words('english'))
 
     words = word_tokenize(text)
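One subtlety in the new check: nltk.data.find() raises LookupError itself when a resource is missing rather than returning a falsy value, so the "if not ... raise LookupError()" guard works only because the find() call raises first. A sketch of a more direct per-resource variant, not the committed code; the path-to-package mapping is an assumption:

import nltk

def ensure_nltk_resources():
    # nltk.data.find raises LookupError for a missing resource; catching
    # it per resource avoids re-downloading everything when one is absent.
    checks = {
        "tokenizers/punkt": "punkt",
        "chunkers/maxent_ne_chunker": "maxent_ne_chunker",
        "corpora/words": "words",
        "corpora/stopwords": "stopwords",
        "taggers/averaged_perceptron_tagger": "averaged_perceptron_tagger",
    }
    for path, package in checks.items():
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(package)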
