【问题标题】:Read data set into well-formatted table with pre-specified number of columns(将数据集读入具有预先指定列数的格式良好的表中)
【发布时间】:2018-09-17 07:36:27
【问题描述】:

我有一个这样的txt.file

0003    MPARTNER  SALZ          S                           150112 22:30:45  160304 08:38:13  2      BUY                          2  BUY                  12380    165426  150109 08:00:00
0003    SPROTTSE  HUGHES        S                           140407 02:30:50  141120 13:55:06  2      BUY                          2  BUY                   3764     57379  140401 10:05:00
0003    SPROTTSE  HUGHES        S                           141223 09:06:13  160715 08:42:56  3      MARKETPERFORM                3  HOLD                  3764     57379  141223 08:02:00
001V    MPARTNER  PEARLSTEIN    D                           140821 02:44:05  150312 09:17:13  2      BUY                          2  BUY                  12380    163717  140820 08:16:00
001V    MPARTNER  PEARLSTEIN    D                           151016 15:07:40  160411 08:40:35  2      BUY                          2  BUY                  12380    163717  151009 08:12:00
001W    CANACCOR                K                           140321 04:06:40  140609 23:06:44         SPECULATIVE BUY              1  STRONG BUY             406    150412  140319 23:19:00
001W    CANACCOR  WRIGHT        K                           140714 12:47:31  160228 22:57:45         BUY                          1  STRONG BUY             406    150412  140714 12:38:00
001W    CLARUS    OFIR          E                           140515 11:40:00  150515 09:27:09         SPECULATIVE BUY              1  STRONG BUY             202    115944  140515 11:40:00
001W    CLARUS    MACKAY        D                           150813 09:40:45  160812 09:40:02         BUY                          1  STRONG BUY             202     73763  150813 09:23:00
001W    DEACON    OFIR          E                           150119 22:03:46  170328 06:45:14  1      BUY                          1  STRONG BUY             704    115944  150112 07:24:00
001W    DEACON    OFIR          E                           171115 06:48:47  171115 06:48:47  1      BUY                          1  STRONG BUY             704    115944  171115 06:42:00
@70L    MORGAN    MARTINEZ      J                           100226 07:12:51  100708 04:51:16  8      EQUALWT/NO RATING            3  HOLD                  1595     56947  100226 07:12:00
@70L    MORGAN    MARTINEZ DE O J                           100708 05:09:02  100910 00:48:28  6      EQUALWT/IN-LINE              3  HOLD                  1595     56947  100708 03:14:00
@70L    MORGAN    MARTINEZ DE O J                           100910 21:16:07  101110 21:55:52  2      OVERWT/IN-LINE               2  BUY                   1595     56947  100910 19:18:00
@70L    MORGAN    OLCOZ CERDAN  J                           101112 01:32:41  120618 21:04:56  2      OVERWT/IN-LINE               2  BUY                   1595     56947  101111 20:03:00
@70L    MORGAN    OLCOZ CERDAN  J                           120712 03:19:26  131216 19:49:59  6      EQUALWT/IN-LINE              3  HOLD                  1595     56947  120711 19:20:00
@70L    MORGAN    OLCOZ CERDAN  J                           140226 22:20:19  150417 13:07:31  2      OVERWT/IN-LINE               2  BUY                   1595     56947  140226 22:20:00
@70L    MORGAN                  J                           150608 01:25:35  171106 00:16:05  1      OVERWT/ATTRACTIVE            2  BUY                   1595     56947  150608 01:25:00

我想在 R 中生成一个表,其结构与 txt 文件中看起来的 16 列结构相同。

我尝试使用代码:

# Count the whitespace-separated fields on each line and show the maximum.
max(count.fields("BSP.txt", sep = ""))  # 18 columns

# Read the file as whitespace-delimited text without a header row, naming all
# 18 columns explicitly (the second name is "VS"; the others follow "V<n>").
df <- read.delim(
  "BSP.txt",
  sep = "",
  header = FALSE,
  col.names = c("V1", "VS", paste0("V", 3:18))
)

但我收到了一个结构怪异的表格:

structure(list(V1 = structure(c(2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 
4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("@70L", "0003", 
"001V", "001W"), class = "factor"), VS = structure(c(5L, 6L, 
6L, 5L, 5L, 1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L
), .Label = c("CANACCOR", "CLARUS", "DEACON", "MORGAN", "MPARTNER", 
"SPROTTSE"), class = "factor"), V3 = structure(c(9L, 1L, 1L, 
8L, 8L, 3L, 10L, 6L, 4L, 6L, 6L, 5L, 5L, 5L, 7L, 7L, 7L, 2L), .Label = c("HUGHES", 
"J", "K", "MACKAY", "MARTINEZ", "OFIR", "OLCOZ", "PEARLSTEIN", 
"SALZ", "WRIGHT"), class = "factor"), V4 = structure(c(9L, 9L, 
9L, 4L, 4L, 1L, 8L, 6L, 4L, 6L, 6L, 7L, 5L, 5L, 3L, 3L, 3L, 2L
), .Label = c("140321", "150608", "CERDAN", "D", "DE", "E", "J", 
"K", "S"), class = "factor"), V5 = structure(c(9L, 4L, 8L, 7L, 
12L, 2L, 6L, 5L, 11L, 10L, 13L, 3L, 15L, 15L, 14L, 14L, 14L, 
1L), .Label = c("01:25:35", "04:06:40", "100226", "140407", "140515", 
"140714", "140821", "141223", "150112", "150119", "150813", "151016", 
"171115", "J", "O"), class = "factor"), V6 = structure(c(16L, 
1L, 5L, 2L, 13L, 12L, 9L, 8L, 6L, 15L, 3L, 4L, 17L, 17L, 7L, 
10L, 11L, 14L), .Label = c("02:30:50", "02:44:05", "06:48:47", 
"07:12:51", "09:06:13", "09:40:45", "101112", "11:40:00", "12:47:31", 
"120712", "140226", "140609", "15:07:40", "171106", "22:03:46", 
"22:30:45", "J"), class = "factor"), V7 = structure(c(10L, 6L, 
12L, 7L, 11L, 17L, 9L, 8L, 13L, 14L, 15L, 4L, 4L, 5L, 2L, 3L, 
16L, 1L), .Label = c("00:16:05", "01:32:41", "03:19:26", "100708", 
"100910", "141120", "150312", "150515", "160228", "160304", "160411", 
"160715", "160812", "170328", "171115", "22:20:19", "23:06:44"
), class = "factor"), V8 = structure(c(5L, 13L, 7L, 8L, 6L, 18L, 
17L, 9L, 10L, 3L, 4L, 1L, 2L, 16L, 12L, 14L, 15L, 11L), .Label = c("04:51:16", 
"05:09:02", "06:45:14", "06:48:47", "08:38:13", "08:40:35", "08:42:56", 
"09:17:13", "09:27:09", "09:40:02", "1", "120618", "13:55:06", 
"131216", "150417", "21:16:07", "22:57:45", "SPECULATIVE"), class = "factor"), 
    V9 = structure(c(6L, 6L, 8L, 6L, 6L, 10L, 10L, 12L, 10L, 
    1L, 1L, 9L, 2L, 3L, 7L, 5L, 4L, 11L), .Label = c("1", "100910", 
    "101110", "13:07:31", "19:49:59", "2", "21:04:56", "3", "8", 
    "BUY", "OVERWT/ATTRACTIVE", "SPECULATIVE"), class = "factor"), 
    V10 = structure(c(6L, 6L, 8L, 6L, 6L, 2L, 2L, 6L, 2L, 6L, 
    6L, 7L, 1L, 4L, 3L, 5L, 3L, 3L), .Label = c("00:48:28", "1", 
    "2", "21:55:52", "6", "BUY", "EQUALWT/NO", "MARKETPERFORM"
    ), class = "factor"), V11 = structure(c(2L, 2L, 3L, 2L, 2L, 
    9L, 9L, 1L, 9L, 1L, 1L, 8L, 4L, 2L, 7L, 6L, 7L, 5L), .Label = c("1", 
    "2", "3", "6", "BUY", "EQUALWT/IN-LINE", "OVERWT/IN-LINE", 
    "RATING", "STRONG"), class = "factor"), V12 = structure(c(4L, 
    4L, 6L, 4L, 4L, 4L, 4L, 8L, 4L, 8L, 8L, 3L, 5L, 7L, 2L, 3L, 
    2L, 1L), .Label = c("1595", "2", "3", "BUY", "EQUALWT/IN-LINE", 
    "HOLD", "OVERWT/IN-LINE", "STRONG"), class = "factor"), V13 = structure(c(1L, 
    5L, 5L, 1L, 1L, 6L, 6L, 8L, 3L, 8L, 8L, 9L, 4L, 2L, 8L, 9L, 
    8L, 7L), .Label = c("12380", "2", "202", "3", "3764", "406", 
    "56947", "BUY", "HOLD"), class = "factor"), V14 = structure(c(5L, 
    7L, 7L, 4L, 4L, 1L, 1L, 6L, 9L, 8L, 8L, 3L, 11L, 10L, 3L, 
    3L, 3L, 2L), .Label = c("150412", "150608", "1595", "163717", 
    "165426", "202", "57379", "704", "73763", "BUY", "HOLD"), class = "factor"), 
    V15 = structure(c(8L, 4L, 7L, 6L, 10L, 3L, 5L, 2L, 9L, 2L, 
    2L, 12L, 11L, 11L, 12L, 12L, 12L, 1L), .Label = c("01:25:00", 
    "115944", "140319", "140401", "140714", "140820", "141223", 
    "150109", "150813", "151009", "1595", "56947"), class = "factor"), 
    V16 = structure(c(2L, 7L, 3L, 5L, 4L, 16L, 10L, 13L, 6L, 
    14L, 15L, 8L, 17L, 17L, 9L, 11L, 12L, 1L), .Label = c("", 
    "08:00:00", "08:02:00", "08:12:00", "08:16:00", "09:23:00", 
    "10:05:00", "100226", "101111", "12:38:00", "120711", "140226", 
    "140515", "150112", "171115", "23:19:00", "56947"), class = "factor"), 
    V17 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 7L, 1L, 4L, 
    2L, 3L, 5L, 6L, 9L, 8L, 10L, 1L), .Label = c("", "06:42:00", 
    "07:12:00", "07:24:00", "100708", "100910", "11:40:00", "19:20:00", 
    "20:03:00", "22:20:00"), class = "factor"), V18 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 
    1L, 1L), .Label = c("", "03:14:00", "19:18:00"), class = "factor")), .Names = c("V1", 
"VS", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11", 
"V12", "V13", "V14", "V15", "V16", "V17", "V18"), class = "data.frame", row.names = c(NA, 
-18L))

如上所述,我希望得到一个包含 16 列的表格,其结构与 txt 文件一致。即使是空字段(例如第 6 行)也应该保留。

例如第 6 行:

你能帮我解决这个问题吗? 非常感谢。

【问题讨论】:

  • 你的问题从第 13 行开始,因为这个人的名字包含被认为是分隔符的空格。

标签: r read.table


【解决方案1】:

一种选择是使用read.fwf

# Read the file as fixed-width text: each entry of `widths` is the character
# width of one of the 16 columns, so blank fields and names that contain
# spaces stay inside their own column.
df <- read.fwf("tst.txt", widths = c(8, 10, 14, 28, 7, 10, 7, 10, 7, 29, 3,
     21, 9, 8, 7, 8), header = FALSE)

# Next, remove the leading/trailing whitespace from the text fields.
# Text columns come back as factors only when stringsAsFactors is TRUE (the
# default before R 4.0.0); from R 4.0.0 on they are character vectors. Both
# types are matched here so the padding is trimmed on every R version, and
# the result is always a character column.
library(dplyr)
df <- df %>%
  mutate(across(
    where(function(x) is.factor(x) || is.character(x)),
    function(x) trimws(as.character(x))
  ))

数据框如下:

df
#      V1       V2            V3 V4     V5       V6     V7       V8 V9               V10 V11        V12   V13    V14    V15      V16
# 1  0003 MPARTNER          SALZ  S 150112 22:30:45 160304 08:38:13  2               BUY   2        BUY 12380 165426 150109 08:00:00
# 2  0003 SPROTTSE        HUGHES  S 140407 02:30:50 141120 13:55:06  2               BUY   2        BUY  3764  57379 140401 10:05:00
# 3  0003 SPROTTSE        HUGHES  S 141223 09:06:13 160715 08:42:56  3     MARKETPERFORM   3       HOLD  3764  57379 141223 08:02:00
# 4  001V MPARTNER    PEARLSTEIN  D 140821 02:44:05 150312 09:17:13  2               BUY   2        BUY 12380 163717 140820 08:16:00
# 5  001V MPARTNER    PEARLSTEIN  D 151016 15:07:40 160411 08:40:35  2               BUY   2        BUY 12380 163717 151009 08:12:00
# 6  001W CANACCOR                K 140321 04:06:40 140609 23:06:44 NA   SPECULATIVE BUY   1 STRONG BUY   406 150412 140319 23:19:00
# 7  001W CANACCOR        WRIGHT  K 140714 12:47:31 160228 22:57:45 NA               BUY   1 STRONG BUY   406 150412 140714 12:38:00
# 8  001W   CLARUS          OFIR  E 140515 11:40:00 150515 09:27:09 NA   SPECULATIVE BUY   1 STRONG BUY   202 115944 140515 11:40:00
# 9  001W   CLARUS        MACKAY  D 150813 09:40:45 160812 09:40:02 NA               BUY   1 STRONG BUY   202  73763 150813 09:23:00
# 10 001W   DEACON          OFIR  E 150119 22:03:46 170328 06:45:14  1               BUY   1 STRONG BUY   704 115944 150112 07:24:00
# 11 001W   DEACON          OFIR  E 171115 06:48:47 171115 06:48:47  1               BUY   1 STRONG BUY   704 115944 171115 06:42:00
# 12 @70L   MORGAN      MARTINEZ  J 100226 07:12:51 100708 04:51:16  8 EQUALWT/NO RATING   3       HOLD  1595  56947 100226 07:12:00
# 13 @70L   MORGAN MARTINEZ DE O  J 100708 05:09:02 100910 00:48:28  6   EQUALWT/IN-LINE   3       HOLD  1595  56947 100708 03:14:00
# 14 @70L   MORGAN MARTINEZ DE O  J 100910 21:16:07 101110 21:55:52  2    OVERWT/IN-LINE   2        BUY  1595  56947 100910 19:18:00
# 15 @70L   MORGAN  OLCOZ CERDAN  J 101112 01:32:41 120618 21:04:56  2    OVERWT/IN-LINE   2        BUY  1595  56947 101111 20:03:00
# 16 @70L   MORGAN  OLCOZ CERDAN  J 120712 03:19:26 131216 19:49:59  6   EQUALWT/IN-LINE   3       HOLD  1595  56947 120711 19:20:00
# 17 @70L   MORGAN  OLCOZ CERDAN  J 140226 22:20:19 150417 13:07:31  2    OVERWT/IN-LINE   2        BUY  1595  56947 140226 22:20:00
# 18 @70L   MORGAN                J 150608 01:25:35 171106 00:16:05  1 OVERWT/ATTRACTIVE   2        BUY  1595  56947 150608 01:25:00

上面的 data.frame 有 16 列和 18 行。

【讨论】:

  • 是的,但问题是,例如对于第 6 行,列中的值分配不是我想要的。V3 列应为空,并且 V9 应合并为值“SPECULATIVE BUY”。请看我编辑后的帖子。谢谢
  • 是的。这就是我现在所意识到的。它增加了行数。
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2023-01-19
  • 1970-01-01
相关资源
最近更新 更多