Programming

그룹화 된 데이터에서 첫 번째 및 마지막 행을 선택하십시오.

procodes 2020. 7. 16. 21:18
반응형

그룹화 된 데이터에서 첫 번째 및 마지막 행을 선택하십시오.


질문

를 사용하여 dplyr그룹화 된 데이터의 상단 및 하단 관찰 / 행을 하나의 문에서 어떻게 선택합니까?

데이터 및 예

주어진 데이터 프레임

df <- data.frame(id=c(1,1,1,2,2,2,3,3,3), 
                 stopId=c("a","b","c","a","b","c","a","b","c"), 
                 stopSequence=c(1,2,3,3,1,4,3,1,2))

을 사용 slice하지만 두 가지 별도의 통계를 사용 하여 각 그룹에서 상단 및 하단 관측치를 얻을 수 있습니다 .

firstStop <- df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  slice(1) %>%
  ungroup

lastStop <- df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  slice(n()) %>%
  ungroup

이 두 가지 통계를 상위 및 하위 관측치를 모두 선택 하는 통계로 결합 할 수 있습니까 ?


아마도 더 빠른 방법이있을 것입니다 :

df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  filter(row_number()==1 | row_number()==n())

완전성을 위해 : slice인덱스 벡터를 전달할 수 있습니다 .

df %>% arrange(stopSequence) %>% group_by(id) %>% slice(c(1,n()))

어느 것이

  id stopId stopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      b            1
6  3      a            3

아니 dplyr,하지만 훨씬 더 사용하여 직접입니다 data.table:

library(data.table)
setDT(df)
df[ df[order(id, stopSequence), .I[c(1L,.N)], by=id]$V1 ]
#    id stopId stopSequence
# 1:  1      a            1
# 2:  1      c            3
# 3:  2      b            1
# 4:  2      c            4
# 5:  3      b            1
# 6:  3      a            3

더 자세한 설명 :

# 1) get row numbers of first/last observations from each group
#    * basically, we sort the table by id/stopSequence, then,
#      grouping by id, name the row numbers of the first/last
#      observations for each id; since this operation produces
#      a data.table
#    * .I is data.table shorthand for the row number
#    * here, to be maximally explicit, I've named the variable V1
#      as row_num to give other readers of my code a clearer
#      understanding of what operation is producing what variable
first_last = df[order(id, stopSequence), .(row_num = .I[c(1L,.N)]), by=id]
idx = first_last$row_num

# 2) extract rows by number
df[idx]

기본 사항을 다루는 시작 위키 를 확인하십시오.data.table


다음과 같은 것 :

library(dplyr)

df <- data.frame(id=c(1,1,1,2,2,2,3,3,3),
                 stopId=c("a","b","c","a","b","c","a","b","c"),
                 stopSequence=c(1,2,3,3,1,4,3,1,2))

first_last <- function(x) {
  bind_rows(slice(x, 1), slice(x, n()))
}

df %>%
  group_by(id) %>%
  arrange(stopSequence) %>%
  do(first_last(.)) %>%
  ungroup

## Source: local data frame [6 x 3]
## 
##   id stopId stopSequence
## 1  1      a            1
## 2  1      c            3
## 3  2      b            1
## 4  2      c            4
## 5  3      b            1
## 6  3      a            3

With do you can pretty much perform any number of operations on the group but @jeremycg's answer is way more appropriate for just this task.


I know the question specified dplyr. But, since others already posted solutions using other packages, I decided to have a go using other packages too:

Base package:

df <- df[with(df, order(id, stopSequence, stopId)), ]
merge(df[!duplicated(df$id), ], 
      df[!duplicated(df$id, fromLast = TRUE), ], 
      all = TRUE)

data.table:

df <-  setDT(df)
df[order(id, stopSequence)][, .SD[c(1,.N)], by=id]

sqldf:

library(sqldf)
min <- sqldf("SELECT id, stopId, min(stopSequence) AS StopSequence
      FROM df GROUP BY id 
      ORDER BY id, StopSequence, stopId")
max <- sqldf("SELECT id, stopId, max(stopSequence) AS StopSequence
      FROM df GROUP BY id 
      ORDER BY id, StopSequence, stopId")
sqldf("SELECT * FROM min
      UNION
      SELECT * FROM max")

In one query:

sqldf("SELECT * 
        FROM (SELECT id, stopId, min(stopSequence) AS StopSequence
              FROM df GROUP BY id 
              ORDER BY id, StopSequence, stopId)
        UNION
        SELECT *
        FROM (SELECT id, stopId, max(stopSequence) AS StopSequence
              FROM df GROUP BY id 
              ORDER BY id, StopSequence, stopId)")

Output:

  id stopId StopSequence
1  1      a            1
2  1      c            3
3  2      b            1
4  2      c            4
5  3      a            3
6  3      b            1

Using data.table:

# convert to data.table
setDT(df) 
# order, group, filter
df[order(stopSequence)][, .SD[c(1, .N)], by = id]

   id stopId stopSequence
1:  1      a            1
2:  1      c            3
3:  2      b            1
4:  2      c            4
5:  3      b            1
6:  3      a            3

Another approach with lapply and a dplyr statement. We can apply an arbitrary number of whatever summary functions to the same statement:

lapply(c(first, last), 
       function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>% 
bind_rows()

You could for example be interested in rows with the max stopSequence value as well and do:

lapply(c(first, last, max("stopSequence")), 
       function(x) df %>% group_by(id) %>% summarize_all(funs(x))) %>%
bind_rows()

A different base R alternative would be to first order by id and stopSequence, split them based on id and for every id we select only the first and last index and subset the dataframe using those indices.

df[sapply(with(df, split(order(id, stopSequence), id)), function(x) 
                   c(x[1], x[length(x)])), ]


#  id stopId stopSequence
#1  1      a            1
#3  1      c            3
#5  2      b            1
#6  2      c            4
#8  3      b            1
#7  3      a            3

Or similar using by

df[unlist(with(df, by(order(id, stopSequence), id, function(x) 
                   c(x[1], x[length(x)])))), ]

참고URL : https://stackoverflow.com/questions/31528981/select-first-and-last-row-from-grouped-data

반응형