bugfix> r > 投稿

各グループの欠損値を個別に置き換えるにはどうすればよいですか?

再現可能な例:

# Example data: 22 rows, two integer grouping columns (group1, group.2) and
# two integer measurement columns (x1, x2) that contain missing values.
mydata <- data.frame(
  group1 = rep(1:2, times = c(8L, 14L)),
  group.2 = rep(1:2, times = 11L),
  x1 = c(
    20L, 4L, 91L, NA, 94L, 69L, 38L, NA, 29L, 69L, 55L,
    86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA, 49L, 90L
  ),
  x2 = c(
    44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L, NA,
    54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L
  )
)

グループ分けをせずに欠損値を平均値で置き換える方法は、すでに見つけました。

library(dplyr)

# Replace every NA in the numeric columns whose names start with "x1" by that
# column's overall mean (no grouping is applied here).
# across() is used instead of the superseded mutate_at() / deprecated funs();
# the numeric check is part of the column selection rather than the NA mask.
mydata %>%
  mutate(across(
    starts_with("x1") & where(is.numeric),
    function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
  ))

しかし、グループ(group1 と group2 の組み合わせ)ごとに、そのグループ内の平均値で欠損値を置き換える必要があります。

編集:より小さなデータセットの例
# Smaller example data: 12 rows with integer grouping columns group1 and
# group.2 and integer value columns x1 and x2, both of which contain NA.
# The result is not assigned to a name here.
structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L, 2L), group.2 = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 
2L, 2L, 2L), x1 = c(63L, 67L, 57L, NA, 65L, 75L, 57L, 80L, 42L, 
NA, 35L, 80L), x2 = c(46L, 1L, NA, 41L, 80L, NA, 74L, 73L, NA, 
13L, 83L, NA)), class = "data.frame", row.names = c(NA, -12L))

回答 1 件
  • mydata <- structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), group2 = c(1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L), x1 = c(20L, 4L, 91L, NA, 94L, 69L, 38L, 
    NA, 29L, 69L, 55L, 86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA, 
    49L, 90L), x2 = c(44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L, 
    NA, 54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L)), class = "data.frame", row.names = c(NA, 
    -22L))
    
    library(tidyverse)

    # Replace each NA in x1 and x2 with the mean of the non-missing values in
    # the same (group1, group2) combination.
    # Grouping directly on both columns keeps them as integers and preserves
    # the original row order, so no unite/gather/spread/separate round trip
    # (which turned the group columns into character) is needed.
    mydata %>%
      group_by(group1, group2) %>%            # one group per combination
      mutate(across(
        c(x1, x2),
        # as.numeric() makes every group return a double, including groups
        # that contain no NA and would otherwise stay integer
        function(x) replace(as.numeric(x), is.na(x), mean(x, na.rm = TRUE))
      )) %>%
      ungroup()                               # forget the grouping
    # # A tibble: 22 x 4
    #    group1 group2    x1    x2
    #     <int>  <int> <dbl> <dbl>
    #  1      1      1  20    44  
    #  2      1      2   4    94  
    #  3      1      1  91    61.3
    #  4      1      2  36.5   1  
    #  5      1      1  94    67  
    #  6      1      2  69    39  
    #  7      1      1  38    73  
    #  8      1      2  36.5  22  
    #  9      2      1  29    44  
    # 10      2      2  69    24  
    # # ... with 12 more rows
    
    

あなたの答え