Let's say I have a large PDF file containing a large database of emails and phone numbers. I am given the task of finding all the emails and phone numbers and producing a nicely formatted text file from them.
The first thing I do is copy all the content of the PDF file to the clipboard.
import re, pyperclip
To tackle this problem, the following steps need to be done:
- Create a regex for phone numbers
- Create a regex for email addresses
- Get the text off the clipboard
- Extract the email/phone from this text
- Copy the extracted email/phone to the clipboard
A phone number can look like any of the following:
444-444-4444 | 444-4444 | (444) 444-4444 | 444-4444 ext 12345 | 444-4444 ext. 12345 | 444-4444 x 12345
# Pattern for US-style phone numbers such as:
#   444-444-4444 | 444-4444 | (444) 444-4444 | 444-4444 ext 12345
#   444-4444 ext. 12345 | 444-4444 x 12345 | 444-4444 x12345
#
# The pattern deliberately keeps several capturing groups: with more than one
# group, findall() returns a tuple per match, and element 0 of that tuple
# (the outermost group) is the complete phone number.
phoneRegex = re.compile(r'''
(                               # group 1: the entire phone number
  (?<!\d)                       # never start in the middle of a longer digit run
  (?:
    (\d{3}|\(\d{3}\))           # group 2: area code, bare or in parentheses
    (\s|-)                      # group 3: separator after the area code
  )?                            # area code and its separator are optional together
  \d{3}                         # first 3 digits
  -                             # separator
  \d{4}                         # last 4 digits
  (?:
    \s(?:ext\.?\s|x\s?)         # extension word: "ext ", "ext. ", "x" or "x "
    (\d{2,5})                   # group 4: extension number
  )?                            # the whole extension is optional
)
''', re.VERBOSE)
# Pattern for email addresses. It has no capturing groups on purpose, so
# findall() returns each matched address as a plain string.
emailRegex = re.compile(r'''
[a-zA-Z0-9_.+%-]+   # name part; inside a character class the . is a literal dot
                    # and a trailing - is a literal hyphen
@                   # @ symbol
[a-zA-Z0-9_.+-]+    # domain name part (hyphens allowed, e.g. my-site.com)
''', re.VERBOSE)
# Sample input used in place of real clipboard contents.
text = 'small sample text 333-333-3333, (433) 433-2322, 333-3333 ext. 3232 and the email dlk_df2@fdsh.df fkd.fdsf@fhdsf>_+.fd'

# The phone pattern contains several groups, so findall() yields one tuple per
# match; the email pattern has no groups, so it yields plain strings.
extractedPhone = phoneRegex.findall(text)
extractedEmail = emailRegex.findall(text)

# Element 0 of each tuple is the first (outermost) group of the phone pattern,
# which is the complete phone number.
allPhoneNumbers = [groups[0] for groups in extractedPhone]
print(allPhoneNumbers)

# Drop any surrounding whitespace picked up by the match.
allPhoneNumbers = [number.strip() for number in allPhoneNumbers]
print(allPhoneNumbers)

# One entry per line: phone numbers first, then email addresses.
phoneLines = '\n'.join(allPhoneNumbers)
emailLines = '\n'.join(extractedEmail)
results = '\n'.join([phoneLines, emailLines])

# Put the report on the clipboard and echo it back as a sanity check.
pyperclip.copy(results)
print(pyperclip.paste())