Using Python to search for key words in text files

As I have previously blogged, Python is very useful in solving everyday tasks that you may encounter. This week, I was tasked with reviewing resumes of potential candidates. As some of the resumes were unexpectedly long (8 pages in one instance), I came up with a list of search terms and then used the Python code below to identify the presence of these terms in the resumes. The script generates a skills matrix of candidate name and skill as a CSV that can be opened in Excel and filtered as needed. Additionally, it counts the number of search terms found in each resume to rank candidates.

This process does not in any way replace reading the resumes. It is just a convenient way to generate a cross-reference of skills and candidates.

import os
import string

# Search terms to look for in each resume (matched case-insensitively).
searchstrings = ('install', 'patch', 'upgrade', 'migrate', 'shell', 'scripts',
                 'rac', 'asm', 'performance', 'data guard', 'golden gate',
                 'import', 'export', 'pump', 'loader', 'rman', 'recovery',
                 'tde', 'db2', 'mysql', 'sybase', 'mongo', 'teradata',
                 'postgres', 'postgresql', 'casandra', 'mssql', 'aws',
                 'jenkins', 'json', 'cloud', 'oci')

# Base directory containing the resume text files to scan.
# (The original wrapped this in parentheses and named it src_dict, making a
# plain string look like a tuple/dict.)
src_dir = "C:/temp/Resumes/temp/"

# Intermediate report: one "name,skill" row per hit; may contain duplicates,
# which the next step removes.
report_file_name = 'C:/temp/Resumes/temp.txt'

# 'with' guarantees every file handle is closed even on error — the original
# never closed the individual resume files at all.
with open(report_file_name, 'w') as report_file:
    for resume in os.listdir(src_dir):
        resume_path = os.path.join(src_dir, resume)
        with open(resume_path) as resume_file:
            # Iterate the file directly instead of readlines() so large
            # resumes are streamed rather than loaded whole into memory.
            for line in resume_file:
                lowered = line.lower()  # lowercase once per line, not once per term
                for word in searchstrings:
                    if word in lowered:
                        report_file.write(resume + ',' + word + '\n')

#
## Sort the data to remove the duplicates
#
# Re-read the raw hits, de-duplicate via a set, and write them back sorted so
# that all rows for one candidate are adjacent (the counting step relies on
# this ordering). 'with' closes both files — the original leaked the read
# handle opened on line `open(report_file_name, 'r')`.
with open(report_file_name, 'r') as duplicates_file:
    content_set = sorted(set(duplicates_file))

unique_file_name = 'C:/temp/Resumes/report.csv'
with open(unique_file_name, 'w') as unique_file:
    unique_file.writelines(content_set)

#
## Count the number of skills that each person has
#
# The report is sorted, so all rows for one candidate are adjacent; a single
# pass with a running counter yields per-candidate totals.
unique_file_name = 'C:/temp/Resumes/report.csv'

prev_name = None   # sentinel: no candidate seen yet (original crashed on an empty report)
skill_ctr = 0

with open(unique_file_name, 'r') as unique_file:
    for line in unique_file:
        curr_name = line.split(',')[0]
        if curr_name == prev_name:
            skill_ctr += 1
        else:
            if prev_name is not None:
                print(prev_name + ' has ' + str(skill_ctr) + ' skills.')
            prev_name = curr_name
            # Start at 1 so the row that introduced the new name is counted.
            # The original reset to 0 here, undercounting every candidate
            # after the first by one skill.
            skill_ctr = 1

# Flush the totals for the final candidate, if the report had any rows.
if prev_name is not None:
    print(prev_name + ' has ' + str(skill_ctr) + ' skills.')