【问题标题】:Convert code into a function in R将代码转换为R中的函数
【发布时间】:2018-05-30 16:01:11
【问题描述】:

我想将一系列步骤转换为函数,因此我只需调用它们即可将其应用于数据帧。下面是一些 cmets 的代码:

library("textreadr")
library("pdftools")   
library("tidyverse")
library("tidytext")    
library("textreadr")
library("tm")

# Sample data frame standing in for the text parsed from one PDF page
Off_let_data <- data.frame(
  page_id = rep(3, 5),
  element_id = c(19, 22, 26, 31, 31),
  text = c(
    "The Protected Percentage of your property value thats has been chosen is 0%",
    "The Arrangement Fee payable at complettion: £50.00",
    "The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
    "The Benchmark rate that will be used to calculate any early repayment 2.08%",
    "The property value used in this scenario is 275,000.00"
  )
)

# Name of the first PDF file found in the working directory
files <- list.files(pattern = "pdf$")[1]

# The account number is the run of digits at the start of that file name
acc_num <- str_extract(files, "^\\d+")

# Patterns that identify the lines of interest
protec_per_reg <- "Protected\\sP\\w+\\sof"
Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"

# Keep the page-3 rows whose text matches at least one of the patterns
Off_let <- filter(
  Off_let_data,
  page_id == 3,
  str_detect(text, protec_per_reg) |
    str_detect(text, Arr_Fee_reg) |
    str_detect(text, Fix_inter_reg) |
    str_detect(text, Bench_rate_reg)
)

# Pull the first number out of each remaining row
off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")

# The number pattern needs at least two digits, so a single-digit value such
# as "0%" comes back as NA; fill every NA with the percentage found in the
# first row
off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
off_let_num

谁能帮我把它变成一个函数。谢谢

【问题讨论】:

    标签: r regex text rstudio


    【解决方案1】:

    这样的?

    函数的输入/输出应该是什么?目前,该函数仅将 data.frame 作为唯一参数,但您可以对其进行扩展,因此您可以传递不同的正则表达式,或者定义 page_id。

    library("textreadr")
    library("pdftools")   
    library("tidyverse")
    library("tidytext")    
    library("textreadr")
    library("tm")
    
    # Sample data frame standing in for the text parsed from one PDF page
    Off_let_data <- data.frame(
      page_id = rep(3, 5),
      element_id = c(19, 22, 26, 31, 31),
      text = c(
        "The Protected Percentage of your property value thats has been chosen is 0%",
        "The Arrangement Fee payable at complettion: £50.00",
        "The Fixed Interest Rate that is applied for the life of the period is: 5.40%",
        "The Benchmark rate that will be used to calculate any early repayment 2.08%",
        "The property value used in this scenario is 275,000.00"
      )
    )
    
    #' Extract the key figures from the parsed text of one offer letter
    #'
    #' Keeps the rows of `df` that are on page 3 and whose `text` matches one of
    #' four fixed patterns, then pulls the first number out of each kept row.
    #'
    #' @param df A data frame with a `page_id` column and a `text` column.
    #' @return A character vector with one entry per kept row, or `character(0)`
    #'   when no row matches.
    dummyFunc <- function(df) {
      stopifnot(
        is.data.frame(df),
        all(c("page_id", "text") %in% names(df))
      )

      # Patterns that identify the lines of interest.
      # NOTE(review): the last pattern requires an upper-case "R", so text that
      # reads "Benchmark rate" (lower-case, as in the sample data) is not
      # matched - confirm against the wording in the real PDFs.
      patterns <- c(
        "Protected\\sP\\w+\\sof",
        "^The\\sArrangement\\sF\\w+",
        "Fixed\\sI\\w+\\sR\\w+",
        "Benchmark\\sR\\w+\\sthat"
      )
      # A row is kept when it matches any pattern, so join them as alternatives
      any_pattern <- paste0("(?:", patterns, ")", collapse = "|")

      Off_let <- df %>%
        filter(.data$page_id == 3, str_detect(.data$text, any_pattern))

      # Nothing matched: return an empty result instead of failing on [[1]] below
      if (nrow(Off_let) == 0) {
        return(character(0))
      }

      # Pull the first number out of each remaining row
      off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")

      # The number pattern needs at least two digits, so a single-digit value
      # such as "0%" comes back as NA; fill every NA with the percentage found
      # in the first row
      off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
      off_let_num
    }
    
    # Run the function on the sample data
    dummyFunc(df = Off_let_data)
    

    对于功能的更扩展版本:

    # Patterns that identify the lines of interest
    protec_per_reg <- "Protected\\sP\\w+\\sof"
    Arr_Fee_reg <- "^The\\sArrangement\\sF\\w+"
    Fix_inter_reg <- "Fixed\\sI\\w+\\sR\\w+"
    Bench_rate_reg <- "Benchmark\\sR\\w+\\sthat"

    # Collect the patterns so they can be passed to the function as one argument
    regexprlist <- list(
      protec_per_reg,
      Arr_Fee_reg,
      Fix_inter_reg,
      Bench_rate_reg
    )
    
    #' Extract the key figures from the parsed text of one offer letter
    #'
    #' Keeps the rows of `df` that are on page `page_id` and whose `text` matches
    #' at least one of the patterns in `regexp`, then pulls the first number out
    #' of each kept row.
    #'
    #' @param df A data frame with a `page_id` column and a `text` column.
    #' @param regexp A list or character vector of regular expressions; a row is
    #'   kept when its text matches any of them.
    #' @param page_id The page number whose rows should be searched.
    #' @return A character vector with one entry per kept row, or `character(0)`
    #'   when no row matches.
    dummyFuncExt <- function(df, regexp, page_id) {
      stopifnot(
        is.data.frame(df),
        all(c("page_id", "text") %in% names(df)),
        length(regexp) > 0
      )

      # Use the patterns that were passed in (not a global list), joined as
      # alternatives so any number of patterns is supported
      any_pattern <- paste0("(?:", unlist(regexp), ")", collapse = "|")

      # Inside filter() a bare `page_id` always means the column, so
      # `page_id == page_id` would compare the column with itself; `.env$`
      # refers to the function argument instead
      Off_let <- df %>%
        filter(
          .data$page_id == .env$page_id,
          str_detect(.data$text, any_pattern)
        )

      # Nothing matched: return an empty result instead of failing on [[1]] below
      if (nrow(Off_let) == 0) {
        return(character(0))
      }

      # Pull the first number out of each remaining row
      off_let_num <- str_extract(Off_let$text, "\\d+\\.?\\d+")

      # The number pattern needs at least two digits, so a single-digit value
      # such as "0%" comes back as NA; fill every NA with the percentage found
      # in the first row
      off_let_num[is.na(off_let_num)] <- str_extract(Off_let$text, "\\d+%")[[1]]
      off_let_num
    }
    
    # Run the extended function on the sample data, searching page 3
    dummyFuncExt(
      df = Off_let_data,
      regexp = regexprlist,
      page_id = 3
    )
    

    【讨论】:

    • 是这样的。基本上,我有超过 200 个 pdf 文档,我想从中提取某些信息,然后将所有这些信息组合成一个数据框。上面的代码一次处理一个 pdf 文件,并在变量“off_let_num”中为我提供了相关信息。现在我想把这段代码变成一个函数,这样我就可以应用到一个文件夹中的所有 pdf 文件,并将它们的结果聚合到一个对象中,最后我将转换为一个数据框。
    • 明天我会在pdf文件上检查这个功能。感谢您的帮助
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2012-04-25
    • 2018-11-05
    • 1970-01-01
    • 2013-03-02
    • 2015-09-05
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多