Commit 9777b17

Merge pull request #28 from redhuntlabs/saudi-pii-pack
Saudi pii pack
2 parents 9b266f5 + 0ec2088

File tree

4 files changed (+65, -3 lines)


definitions.json

Lines changed: 54 additions & 0 deletions
@@ -151,6 +151,60 @@
     "<<<<"
   ]
 },
+"Resident Identity (Iqama)": {
+  "regex":null,
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Kingdom",
+    "Saudi",
+    "Arabia",
+    "Ministry",
+    "Interior",
+    "Permit",
+    "Iqama",
+    "Residen",
+    "Identity"
+  ]
+},
+"Saudi Driver's License": {
+  "regex":"\b[0-9]{10}\b",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Kingdom",
+    "Saudi",
+    "Arabia",
+    "Ministry",
+    "Interior",
+    "Driving",
+    "License"
+  ]
+},
+"Saudi Arabian Visa": {
+  "regex":"(?:V<SAU)(?:[A-Z0-9<].+)",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Visa",
+    "Saudi Arabia",
+    "V<SAU",
+    "<<<<",
+    "Entries",
+    "Permitted",
+    "Work",
+    "Validity"
+  ]
+},
+"Tawuniya Health Insurance": {
+  "regex":"\b[0-9]{5}\b",
+  "region":"Saudi Arabia",
+  "keywords":[
+    "Tawuniya",
+    "Policy",
+    "Holder",
+    "Number",
+    "Deductible",
+    "Approval"
+  ]
+},
 "Nebraska Driver's License": {
   "regex":"[A-Z]{1}[0-9]{9,11}",
   "region":"United States",

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
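The hunks shown here only add the dependency and don't reveal where spacy is imported; presumably it backs named-entity recognition alongside NLTK. A minimal sketch of that assumed usage (the en_core_web_sm model must be downloaded separately):

import spacy  # assumed NER use; the diff does not show the import site

# Requires: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp("Issued by the Ministry of Interior, Riyadh, Saudi Arabia.")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. GPE/ORG entities as region hints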

text_utils.py

Lines changed: 10 additions & 3 deletions
@@ -94,9 +94,16 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords
 
-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
+    try:
+        nltk_resources = ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/words.zip"]
+        for resource in nltk_resources:
+            if not nltk.data.find(resource): raise LookupError()
+    except LookupError:
+        for resource in resources:
+            nltk.download(resource)
+
     stop_words = set(stopwords.words('english'))
 
     words = word_tokenize(text)
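One subtlety in the new check: nltk.data.find() raises LookupError itself when a resource is missing rather than returning a falsy value, so the "if not ... raise LookupError()" guard works only because the find() call raises first. A sketch of a more direct per-resource variant, not the committed code; the path-to-package mapping is an assumption:

import nltk

def ensure_nltk_resources():
    # nltk.data.find raises LookupError for a missing resource; catching
    # it per resource avoids re-downloading everything when one is absent.
    checks = {
        "tokenizers/punkt": "punkt",
        "chunkers/maxent_ne_chunker": "maxent_ne_chunker",
        "corpora/words": "words",
        "corpora/stopwords": "stopwords",
        "taggers/averaged_perceptron_tagger": "averaged_perceptron_tagger",
    }
    for path, package in checks.items():
        try:
            nltk.data.find(path)
        except LookupError:
            nltk.download(package)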
