In this guide, I will show you how to extract text from PDF files. LinkedIn recently published a guide on how to write successful job descriptions. Click here to download their PDF.
# Extract the role, title and main sections from every .docx job description
# in a folder and save the result as a single CSV file.

# NOTE(review): setwd() ties the script to one machine's folder layout. It is
# kept so that the relative paths below resolve exactly as before.
setwd("~/Documents/job descriptions/")

library(officer)
library(dplyr)
library(stringr)
library(tidyr)
library(stringi)

# Folder in which I downloaded the 23 job descriptions.
path <- "23 Job descriptions/"

# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored at the end: only names that really end in ".docx" are selected.
file.names <- dir(path, pattern = "\\.docx$")
if (length(file.names) == 0L) {
  stop("No .docx files found in '", path, "'", call. = FALSE)
}

# Return the first match of the Perl-style regex `pattern` in `text`.
# regmatches() yields character(0) when nothing matches, which would break
# data.frame() below, so a missing section is reported as NA instead.
extract_section <- function(text, pattern) {
  hit <- regmatches(text, regexpr(pattern, text, perl = TRUE))
  if (length(hit) == 0L) NA_character_ else hit
}

# Read one .docx file and return a one-row data frame holding its first two
# text entries and the three sections found between the headings.
read_job_description <- function(docx_path) {
  content <- docx_summary(read_docx(docx_path))

  # All text entries joined with commas; the patterns below depend on that
  # separator to locate the section headings.
  combined <- paste0(content$text, collapse = ",")

  # NOTE(review): assumes the first two text entries are the role and the job
  # description title -- confirm against the documents.
  # NOTE(review): the first look-ahead has no comma before RESPONSIBILITIES,
  # unlike the other two patterns, so the summary may keep a trailing comma.
  # Patterns are left untouched; confirm the intended output.
  data.frame(
    role = as.character(content$text[1]),
    title = as.character(content$text[2]),
    summary = extract_section(
      combined,
      "OVERALL SUMMARY,\\s*\\K.*?(?=\\s*RESPONSIBILITIES)"
    ),
    responsibilities = extract_section(
      combined,
      "RESPONSIBILITIES,\\s*\\K.*?(?=\\s*,REQUIREMENTS)"
    ),
    requirements = extract_section(
      combined,
      "REQUIREMENTS,\\s*\\K.*?(?=\\s*,ABOUT)"
    ),
    stringsAsFactors = FALSE
  )
}

# Build one row per file, then bind once instead of growing a data frame
# inside a loop.
data <- do.call(rbind, lapply(paste0(path, file.names), read_job_description))

colnames(data) <- c(
  "Role",
  "Job description title",
  "Overall summary",
  "Responsibilities",
  "Requirements"
)
data$Role <- stri_trans_totitle(data$Role)

write.csv(data, file = "job decriptions data.csv", row.names = FALSE)