astro21 committed on
Commit
da7be98
1 Parent(s): 35236b5

Upload 3 files

Browse files
Files changed (3) hide show
  1. ResumeStructure.py +15 -0
  2. prompt_template.py +107 -0
  3. utils.py +119 -0
ResumeStructure.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Union
2
+ from langchain_core.pydantic_v1 import BaseModel, Field
3
+
4
+
5
class ResumeStructure(BaseModel):
    # Pydantic model describing the JSON shape requested from the LLM.
    # Deliberately documented with comments instead of a class docstring:
    # pydantic surfaces a docstring as the schema "description", which would
    # alter any format instructions generated from this model.

    # One entry per institution.
    education: List[Dict[str, str]] = Field(
        description="List of dictionaries containing 'university' and 'CGPA'",
    )

    # One entry per job; values are strings or lists of strings.
    work: List[Dict[str, Union[str, List[str]]]] = Field(
        description=(
            "List of dictionaries containing 'organization', 'location', "
            "'position', 'duration', 'standardized_job_title', and "
            "'predicted_skills'"
        ),
    )

    # One entry per project; values are strings or lists of strings.
    projects: List[Dict[str, Union[str, List[str]]]] = Field(
        description=(
            "List of dictionaries containing 'project_name', 'start_date', "
            "'end_date', 'description', and 'predicted_skills'"
        ),
    )

    # Skill names grouped by category.
    skills: Dict[str, List[str]] = Field(
        description="Dictionary containing 'technical' and 'non_technical' skills",
    )

    # Free-text summary of the candidate's progression.
    career_trajectory: str = Field(
        description="String representing the career progression of the candidate",
    )
prompt_template.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Prompt for the plain-text resume summary. The only placeholder is {text}
# (the raw resume text); do not add other curly braces, because the string is
# rendered with str.format-style substitution.
template = """
For the following text, extract the following information:

Warning: Don't greet or write any introduction. Just start with the answer to the prompts. Do as per the instructions given in the prompt. If you don't know the answer, leave that part (keep blank) and move to the next part.

1. Education: Extract the names of all universities/colleges attended by the candidate with their CGPA.


2. Work: Extract all organization names where he/she has worked along with the position held, and the duration of employment.
Predicted Skills : Also extract skills based on the work experience.
Standardized Job Title: Identify the standardized job title for each work experience.

3. Projects: Extract the details of the projects the candidate has worked on.
Predicted Skills : Also extract skills based on each project.


4.Skills: Identify the technical and non-technical skills associated with each work experience and project.


5.Career Trajectory: Identify the career progression of the candidate based on their work experience.

Output them in the following format:
Warning: if there is no data for any of the fields, leave it blank.

"Education: " and separate multiple entries with new line .

"Work: " Organization Name, Location, Position, Start Date - End Date and separate multiple entries with a comma.
"Job Title: " Identify the job title for each work experience. Clean and strip them off suffixes, prefixes and seniority.

" Predicted Skills : " and separate multiple entries with a comma for each work experience.
Note: Separate each work experience with a new line.
Warning: Don't print this text - "Organization Name, Location, Position, Start Date - End Date" as it is in the output .


"Project Name, Start Date - End Date, Project Description " and separate multiple entries with a comma and a new line for each project.
" Predicted Skills : " and separate multiple entries with a comma for each project.
Note: Project Description should be in 30 to 40 words

Note: Separate each project with a new line.
Warning: Don't print "Project Name, Start Date - End Date, Project Description" as it is (text) in the output .

"Skills: " Skills under the skills section.
Classify them as technical and non-technical skills if possible.

"Career Trajectory: " and separate multiple entries with a -> . Career Trajectory should be in ascending order with respect to date of joining.
eg1 : "Data Analyst -> Data Scientist -> Senior Data Scientist"
eg2 : "School Name -> College Name -> University Name -> Job Title -> Job Title"

Resume: {text}

"""
53
+
54
# Prompt for the JSON-structured resume. Placeholders: {text} (the raw resume
# text) and {format_instructions} (output-format instructions supplied by the
# caller). Do not add other curly braces, because the string is rendered with
# str.format-style substitution.
template_format_instructions = """
For the following text, extract the following information:

Warning: Don't greet or write any introduction. Just start with the answer to the prompts. Do as per the instructions given in the prompt. If you don't know the answer, leave that part (keep blank) and move to the next part.

1. Education: Extract the names of all universities/colleges attended by the candidate with their CGPA.


2. Work: Extract all organization names where he/she has worked along with the position held, and the duration of employment.
Predicted Skills : Also extract skills based on the work experience.
Standardized Job Title: Identify the standardized job title for each work experience.

3. Projects: Extract the details of the projects the candidate has worked on.
Predicted Skills : Also extract skills based on each project.


4.Skills: Identify the technical and non-technical skills associated with each work experience and project.


5.Career Trajectory: Identify the career progression of the candidate based on their work experience.

Output them in the following format:
Warning: if there is no data for any of the fields, leave it blank.

"Education: " and separate multiple entries with new line .

"Work: " Organization Name, Location, Position, Start Date - End Date and separate multiple entries with a comma.
"Job Title: " Identify the job title for each work experience. Clean and strip them off suffixes, prefixes and seniority.

" Predicted Skills : " and separate multiple entries with a comma for each work experience.
Note: Separate each work experience with a new line.
Warning: Don't print this text - "Organization Name, Location, Position, Start Date - End Date" as it is in the output .


"Project Name, Start Date - End Date, Project Description " and separate multiple entries with a comma and a new line for each project.
" Predicted Skills : " and separate multiple entries with a comma for each project.
Note: Project Description should be in 30 to 40 words

Note: Separate each project with a new line.
Warning: Don't print "Project Name, Start Date - End Date, Project Description" as it is (text) in the output .

"Skills: " Skills under the skills section.
Classify them as technical and non-technical skills if possible.

"Career Trajectory: " and separate multiple entries with a -> . Career Trajectory should be in ascending order with respect to date of joining.
eg1 : "Data Analyst -> Data Scientist -> Senior Data Scientist"
eg2 : "School Name -> College Name -> University Name -> Job Title -> Job Title"

Resume: {text}

\n{format_instructions}\n

"""
utils.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from dedoc import DedocManager
4
+ from langchain.chat_models import ChatOpenAI
5
+ from langchain.prompts import PromptTemplate
6
+ from langchain_core.output_parsers import JsonOutputParser
7
+ from ResumeStructure import ResumeStructure
8
+ from fastapi import UploadFile
9
+ from prompt_template import template_format_instructions, template
10
+ from typing import List
11
+
12
# Directory where uploaded files are staged while they are being parsed.
TEMP_DIR = "temp_files"
# exist_ok avoids the check-then-create race when several workers import this
# module at the same time.
os.makedirs(TEMP_DIR, exist_ok=True)
16
+
17
+
18
async def process_file_with_dedoc(file: UploadFile):
    """
    Parse an uploaded file with Dedoc and return the parsed document data.

    The upload is staged under TEMP_DIR, parsed, and the staged copy is
    always removed afterwards, even when parsing fails.

    Args:
    - file: The UploadFile object to be processed.

    Returns:
    - The result of ``to_api_schema().model_dump()`` on the Dedoc output, or
      None when the file extension is not one of the supported formats.

    Raises:
    - Any exception raised while writing the staged file or by Dedoc itself
      is propagated to the caller after the staged file has been cleaned up.
    """
    supported_formats = ('jpg', 'jpeg', 'png', 'docx', 'pdf', 'html', 'doc')

    print(f"Processing file '{file.filename}'...")

    # The filename is client-controlled: keep only its last path component so
    # that values such as "../../x.pdf" cannot escape TEMP_DIR.
    safe_name = os.path.basename(file.filename or "")

    # Extension without the leading dot, lower-cased for comparison.
    file_extension = os.path.splitext(safe_name)[1][1:].lower()

    # Reject unsupported formats before anything is written to disk, so no
    # orphaned temporary file is left behind.
    if file_extension not in supported_formats:
        print(f"Cannot process file '{file.filename}'. Unsupported file format.")
        return None

    file_path = os.path.join(TEMP_DIR, safe_name)
    try:
        # Stage the upload on disk; Dedoc is given a file path to parse.
        with open(file_path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        output = DedocManager().parse(file_path)
        return output.to_api_schema().model_dump()
    finally:
        # Clean up on both the success and the failure path.
        if os.path.exists(file_path):
            os.remove(file_path)
57
+
58
+
59
async def extract_text_from_all_levels(data):
    """
    Collect the text of every subparagraph found in parsed document data.

    Args:
    - data: Mapping that provides ``data['content']['structure']``.

    Returns:
    - The text gathered by ``extract_text_from_subparagraphs`` for the
      top-level 'subparagraphs' list, or an empty string when the structure
      has no 'subparagraphs' key.
    """
    structure = data['content']['structure']

    # Nothing to walk: the structure carries no subparagraphs at all.
    if 'subparagraphs' not in structure:
        return ""

    return "" + await extract_text_from_subparagraphs(structure['subparagraphs'])
75
+
76
+
77
async def extract_text_from_subparagraphs(subparagraphs):
    """
    Extract text from subparagraphs and all of their nested subparagraphs.

    The tree is walked depth-first in document order (a paragraph's text
    comes before the text of its own subparagraphs, which come before its
    next sibling). An explicit stack is used instead of recursion so that
    deeply nested input cannot hit the interpreter's recursion limit, and
    the pieces are joined once instead of growing a string with ``+=``.

    Args:
    - subparagraphs: A list of dicts, each with a 'text' entry and an
      optional 'subparagraphs' list of the same shape.

    Returns:
    - A string containing each paragraph's text followed by a newline.

    Raises:
    - KeyError: if a paragraph has no 'text' entry.
    """
    lines = []
    # Reversed so that pop() yields paragraphs in their original order.
    pending = list(subparagraphs)[::-1]
    while pending:
        subpara = pending.pop()
        lines.append(subpara['text'] + "\n")
        if 'subparagraphs' in subpara:
            # Children go on top of the stack, so they are emitted before
            # the remaining siblings.
            pending.extend(list(subpara['subparagraphs'])[::-1])
    return "".join(lines)
93
+
94
+
95
def generate_formatted_resume(resume, chat_llm):
    """
    Run the plain-text resume prompt through the chat model.

    Args:
    - resume: Resume text substituted for the prompt's ``text`` variable.
    - chat_llm: Chat model that the prompt is piped into.

    Returns:
    - The ``content`` attribute of the model's response.
    """
    resume_prompt = PromptTemplate(
        input_variables=["text"],
        template=template,
    )
    # Prompt -> model pipeline, invoked once with the resume text.
    response = (resume_prompt | chat_llm).invoke({"text": resume})
    return response.content
105
+
106
+
107
def generate_json_structured_resume(resume, chat_llm):
    """
    Run the JSON resume prompt through the chat model and parse the reply.

    Args:
    - resume: Resume text substituted for the prompt's ``text`` variable.
    - chat_llm: Chat model that the prompt is piped into.

    Returns:
    - Whatever the JsonOutputParser yields for the model's reply (presumably
      a dict shaped like ResumeStructure; not validated in this function).
    """
    json_parser = JsonOutputParser(pydantic_object=ResumeStructure)

    # The parser's own format instructions are bound up front, so callers
    # only have to supply the resume text.
    format_instructions = parser_instructions = json_parser.get_format_instructions()
    structured_prompt = PromptTemplate(
        input_variables=["text"],
        partial_variables={"format_instructions": format_instructions},
        template=template_format_instructions,
    )

    # Prompt -> model -> JSON parser.
    pipeline = structured_prompt | chat_llm | json_parser
    return pipeline.invoke({"text": resume})