import pandas as pd
10 Censor Text
10.1 Censor PII
def censor_pii(text: str) -> str:
    """Replace age, gender, doctor-name and PCT-number mentions with placeholders.

    The substitutions are applied in a fixed order, all case-insensitively:

    1. ``<1-3 digits>-year-old``                  -> ``[censored-age]``
    2. man / male / woman / female / boy / girl   -> ``[censored-gender]``
    3. ``Dr`` / ``Dr.`` followed by a name        -> ``[censored-dr-name]``
    4. ``PCT`` followed by digits                 -> ``[censored-tel]``

    Args:
        text: The free text to censor.

    Returns:
        A copy of ``text`` with every match replaced by its placeholder.
    """
    import re

    # (pattern, placeholder) pairs; the order matches the numbered list above.
    rules = (
        # Age, e.g. "66-year-old".
        (r'\b\d{1,3}-year-old\b', '[censored-age]'),
        # Gender words, matched as whole words only.
        (r'\b(man|male|woman|female|boy|girl)\b', '[censored-gender]'),
        # Doctor's name, e.g. "Dr. Arnuparp". Name parts joined by an
        # apostrophe or hyphen ("O'Neill", "Smith-Jones") are consumed as a
        # whole so that no fragment of the name is left behind; as a side
        # effect a possessive "'s" directly after the name is censored too.
        (r"\bDr\.?\s+[A-Za-z]+(?:['\u2019-][A-Za-z]+)*\b", '[censored-dr-name]'),
        # PCT number, with or without whitespace, e.g. "PCT 48676", "PCT98765".
        (r'\bPCT\s*\d+\b', '[censored-tel]'),
    )
    for pattern, placeholder in rules:
        text = re.sub(pattern, placeholder, text, flags=re.IGNORECASE)
    return text
Explanation:
- Age: `\b\d{1,3}-year-old\b` matches a number (1-3 digits) followed by `-year-old`, where `\b` ensures word boundaries so it doesn't match parts of other words.
- Gender: `\b(man|male|woman|female|boy|girl)\b` matches any of the specified gender-related words, using the word boundary `\b` to match complete words only.
- Doctor's Name: `\bDr\.?\s+[A-Za-z]+\b` matches "Dr." (with or without a period) followed by a name (one or more alphabetic characters).
- PCT Number: `\bPCT\s*\d+\b` matches "PCT" followed by optional whitespace and one or more digits.
# Example: censor a single sentence and display the result.
text = "The patient is a 66-year-old male. He was treated by Dr. Arnuparp. His PCT 48676 is recorded."
censored_text = censor_pii(text)
censored_text
'The patient is a [censored-age] [censored-gender]. He was treated by [censored-dr-name]. His [censored-tel] is recorded.'
10.1.1 Multiple Texts
# Sample sentences, each mixing age, gender, doctor-name and PCT mentions.
text_ls = [
    "The 35-year-old female visited Dr. Smith. Her PCT 12345 was submitted.",
    "A 12-year-old boy came in for a checkup. Dr. O'Neill, and his PCT98765 was updated.",
    "Dr. Catherine gave a prescription to the 55-year-old male. The patient's PCT 56789 was noted in the records.",
]
# One row per sentence, in a single "text" column.
text_df = pd.DataFrame({"text": text_ls})
text_df
text | |
---|---|
0 | The 35-year-old female visited Dr. Smith. Her ... |
1 | A 12-year-old boy came in for a checkup. Dr. O... |
2 | Dr. Catherine gave a prescription to the 55-ye... |
# Add a "text_censored" column by applying censor_pii to every row of "text".
text_df_mod = (
    text_df
    .pipe(lambda df: df.assign(text_censored=df["text"].map(censor_pii)))
)
text_df_mod
text | text_censored | |
---|---|---|
0 | The 35-year-old female visited Dr. Smith. Her ... | The [censored-age] [censored-gender] visited [... |
1 | A 12-year-old boy came in for a checkup. Dr. O... | A [censored-age] [censored-gender] came in for... |
2 | Dr. Catherine gave a prescription to the 55-ye... | [censored-dr-name] gave a prescription to the ... |