import pandas as pd

10 Censor Text
10.1 Censor PII
def censor_pii(text: str) -> str:
    """Replace age, gender, doctor-name and PCT-number mentions with tags.

    Four case-insensitive substitutions are applied in this order:

    * age, e.g. ``66-year-old``, ``66 year old``, ``40 years old``
      -> ``[censored-age]``
    * gender words (man, male, woman, female, boy, girl)
      -> ``[censored-gender]``
    * ``Dr`` / ``Dr.`` followed by whitespace and one name, e.g.
      ``Dr. Arnuparp``, ``Dr. O'Neill``, ``Dr. Smith-Jones``
      -> ``[censored-dr-name]``
    * ``PCT`` followed by digits, with or without whitespace in between,
      e.g. ``PCT 48676``, ``PCT98765`` -> ``[censored-tel]``

    Only the single name token directly after ``Dr`` is replaced; any
    further words (e.g. a surname after a first name) are left untouched.

    Args:
        text: The string to censor.

    Returns:
        A copy of ``text`` with every match replaced by its tag.
    """
    # Function-scope import keeps the function self-contained, as in the
    # original cell.
    import re

    # One run of letters. ``[^\W\d_]`` is "word character that is neither a
    # digit nor an underscore", so accented letters (e.g. "Müller") count.
    name_part = r"[^\W\d_]+"

    rules = (
        # Accept hyphens or whitespace between the parts, and "year(s)".
        (r"\b\d{1,3}[-\s]+years?[-\s]+old\b", "[censored-age]"),
        (r"\b(man|male|woman|female|boy|girl)\b", "[censored-gender]"),
        # A name may contain internal apostrophes (straight or U+2019) or
        # hyphens; without this, "Dr. O'Neill" would leak "'Neill".
        # A possessive "'s" directly after the name is consumed as well.
        (
            r"\bDr\.?\s+" + name_part + r"(?:['\u2019-]" + name_part + r")*\b",
            "[censored-dr-name]",
        ),
        (r"\bPCT\s*\d+\b", "[censored-tel]"),
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text

# Explanation:
- Age: `\b\d{1,3}-year-old\b` matches a number (1-3 digits) followed by `-year-old`, where `\b` ensures word boundaries so it doesn’t match parts of other words.
- Gender: `\b(man|male|woman|female|boy|girl)\b` matches any of the specified gender-related words, using the word boundary `\b` to match complete words only.
- Doctor’s Name: `\bDr\.?\s+[A-Za-z]+\b` matches “Dr.” (with or without a period) followed by a name (one or more alphabetic characters).
- PCT Number: `\bPCT\s*\d+\b` matches “PCT” followed by optional whitespace and one or more digits.
# Single example sentence containing an age, a gender word, a doctor's name
# and a PCT number.
text = (
    "The patient is a 66-year-old male. He was treated by Dr. Arnuparp. His PCT 48676 is recorded."
)
censored_text = censor_pii(text)
censored_text

'The patient is a [censored-age] [censored-gender]. He was treated by [censored-dr-name]. His [censored-tel] is recorded.'
10.1.1 Multiple Texts
# Three sample notes, each containing an age, a gender word, a doctor's name
# and a PCT number.
text_ls = [
    "The 35-year-old female visited Dr. Smith. Her PCT 12345 was submitted.",
    "A 12-year-old boy came in for a checkup. Dr. O'Neill, and his PCT98765 was updated.",
    "Dr. Catherine gave a prescription to the 55-year-old male. The patient's PCT 56789 was noted in the records.",
]
# One row per note, stored in a single "text" column.
text_df = pd.DataFrame(dict(text=text_ls))
text_df

| | text |
|---|---|
| 0 | The 35-year-old female visited Dr. Smith. Her ... |
| 1 | A 12-year-old boy came in for a checkup. Dr. O... |
| 2 | Dr. Catherine gave a prescription to the 55-ye... |
# Add a "text_censored" column holding the censor_pii result for each row of
# the "text" column; text_df itself is not modified.
text_df_mod = text_df.assign(
    text_censored=lambda frame: frame["text"].map(censor_pii)
)
text_df_mod

| | text | text_censored |
|---|---|---|
| 0 | The 35-year-old female visited Dr. Smith. Her ... | The [censored-age] [censored-gender] visited [... |
| 1 | A 12-year-old boy came in for a checkup. Dr. O... | A [censored-age] [censored-gender] came in for... |
| 2 | Dr. Catherine gave a prescription to the 55-ye... | [censored-dr-name] gave a prescription to the ... |