10 Censor Text

import pandas as pd

10.1 Censor PII

def censor_pii(text) -> str:
    """Replace personally identifying fragments of *text* with placeholders.

    Four kinds of fragment are censored, all case-insensitively:

    * ages written as ``<1-3 digits>-year-old`` -> ``[censored-age]``
    * doctor names: ``Dr`` or ``Dr.``, whitespace, then one name token that
      may contain internal apostrophes or hyphens (``O'Neill``,
      ``Smith-Jones``) -> ``[censored-dr-name]``
    * the words man, male, woman, female, boy, girl -> ``[censored-gender]``
    * ``PCT`` followed (with or without whitespace) by digits
      -> ``[censored-tel]``

    Args:
        text: The string to censor.

    Returns:
        A copy of ``text`` with every match replaced by its placeholder.
    """
    import re

    # Age, e.g. "66-year-old".
    text = re.sub(r'\b\d{1,3}-year-old\b', '[censored-age]', text, flags=re.IGNORECASE)

    # Doctor's name, e.g. "Dr. Arnuparp", "Dr. O'Neill", "Dr. Smith-Jones".
    # This runs BEFORE the gender pass: otherwise a surname such as "Man" or
    # "Boy" would first become "[censored-gender]" and the name pattern would
    # no longer match, leaving a garbled "Dr. [censored-gender]".
    # The optional group lets the name continue across an apostrophe or
    # hyphen so the part after it is not leaked; the (?!s\b) lookahead keeps
    # a trailing possessive ("Dr. Smith's") outside the match.
    text = re.sub(
        r"\bDr\.?\s+[A-Za-z]+(?:['’-](?!s\b)[A-Za-z]+)*\b",
        '[censored-dr-name]',
        text,
        flags=re.IGNORECASE,
    )

    # Gender words, matched as whole words only.
    text = re.sub(r'\b(man|male|woman|female|boy|girl)\b', '[censored-gender]', text, flags=re.IGNORECASE)

    # PCT number, e.g. "PCT 48676" or "PCT98765".
    text = re.sub(r'\bPCT\s*\d+\b', '[censored-tel]', text, flags=re.IGNORECASE)

    return text

Explanation:

Age: \b\d{1,3}-year-old\b matches a number (1-3 digits) followed by -year-old, where \b ensures word boundaries so it doesn’t match parts of other words.
Gender: \b(man|male|woman|female|boy|girl)\b matches any of the specified gender-related words, using the word boundary \b to match complete words only.
Doctor’s Name: \bDr\.?\s+[A-Za-z]+\b matches “Dr.” (with or without a period) followed by a name (one or more alphabetic characters).
PCT Number: \bPCT\s*\d+\b matches “PCT” followed by zero or more whitespace characters and then one or more digits, so both “PCT 48676” and “PCT98765” are matched.

# Sample sentence containing one instance of each pattern censor_pii targets:
# an age, a gender word, a doctor's name and a PCT number.
text = "The patient is a 66-year-old male. He was treated by Dr. Arnuparp. His PCT 48676 is recorded."
censored_text = censor_pii(text)
# Bare expression: evaluates the censored string (a notebook shows it as the cell output).
censored_text

'The patient is a [censored-age] [censored-gender]. He was treated by [censored-dr-name]. His [censored-tel] is recorded.'

10.1.1 Multiple Texts

# Three sample notes, each mixing an age, a gender word, a doctor's name
# and a PCT number in a different order.
text_ls = [
    "The 35-year-old female visited Dr. Smith. Her PCT 12345 was submitted.",
    "A 12-year-old boy came in for a checkup. Dr. O'Neill, and his PCT98765 was updated.",
    "Dr. Catherine gave a prescription to the 55-year-old male. The patient's PCT 56789 was noted in the records.",
]

# One row per note, held in a single "text" column.
text_df = pd.DataFrame(text_ls, columns=["text"])
text_df

	text
0	The 35-year-old female visited Dr. Smith. Her ...
1	A 12-year-old boy came in for a checkup. Dr. O...
2	Dr. Catherine gave a prescription to the 55-ye...

# Add a "text_censored" column holding censor_pii applied to each row's
# "text"; assign returns a new frame, so text_df itself is left as it was.
text_df_mod = text_df.assign(
    text_censored=lambda frame: frame["text"].map(censor_pii)
)

text_df_mod

	text	text_censored
0	The 35-year-old female visited Dr. Smith. Her ...	The [censored-age] [censored-gender] visited [...
1	A 12-year-old boy came in for a checkup. Dr. O...	A [censored-age] [censored-gender] came in for...
2	Dr. Catherine gave a prescription to the 55-ye...	[censored-dr-name] gave a prescription to the ...