The function `docx_summary` returns the content of a Word document.
library(officer)
# Locate the example Word document bundled with the officer package.
example_docx <- system.file(
  "doc_examples", "example.docx",
  package = "officer"
)
# Read the document, then summarise its content as a data frame.
doc <- read_docx(example_docx)
content <- docx_summary(doc)
# Show the summary.
content
## # A tibble: 69 x 11
## doc_index content_type style_name
## <int> <chr> <chr>
## 1 1 paragraph heading 1
## 2 2 paragraph <NA>
## 3 3 paragraph heading 1
## 4 4 paragraph List Paragraph
## 5 5 paragraph List Paragraph
## 6 6 paragraph List Paragraph
## 7 7 paragraph heading 2
## 8 8 paragraph List Paragraph
## 9 9 paragraph List Paragraph
## 10 10 paragraph List Paragraph
## # ... with 59 more rows, and 8 more variables: text <chr>, level <dbl>,
## # num_id <int>, row_id <int>, is_header <lgl>, cell_id <dbl>,
## # col_span <dbl>, row_span <dbl>
Explore the results:
library(dplyr)
# Count the distinct document elements (doc_index) for each content type.
content %>%
  group_by(content_type) %>%
  summarise(n = n_distinct(doc_index))
## # A tibble: 2 x 2
## content_type n
## <chr> <int>
## 1 paragraph 17
## 2 table cell 1
To get all paragraphs:
# Keep the paragraph rows and the columns that describe them.
par_data <- content %>%
  filter(content_type %in% "paragraph") %>%
  select(doc_index, style_name, text, level, num_id) %>%
  # Shorten the text to at most 30 characters so it can be displayed in
  # this vignette; substr() already stops at the end of shorter strings.
  mutate(text = substr(text, start = 1, stop = 30))
par_data
## # A tibble: 17 x 5
## doc_index style_name text level num_id
## <int> <chr> <chr> <dbl> <int>
## 1 1 heading 1 Title 1 NA NA
## 2 2 <NA> Lorem ipsum dolor sit amet, co NA NA
## 3 3 heading 1 Title 2 NA NA
## 4 4 List Paragraph Quisque tristique 1 2
## 5 5 List Paragraph Augue nisi, et convallis 1 2
## 6 6 List Paragraph Sapien mollis nec. 1 2
## 7 7 heading 2 Sub title 1 NA NA
## 8 8 List Paragraph Quisque tristique 1 1
## 9 9 List Paragraph Augue nisi, et convallis 1 1
## 10 10 List Paragraph Sapien mollis nec. 1 1
## 11 11 <NA> NA NA
## 12 12 <NA> Phasellus nec nunc vitae nulla NA NA
## 13 13 heading 2 Sub title 2 NA NA
## 14 14 <NA> Morbi rhoncus sapien sit amet NA NA
## 15 15 <NA> NA NA
## 16 17 <NA> NA NA
## 17 18 <NA> NA NA
Tables are unstacked:
# Keep only the table-cell rows of the summary.
table_cells <- content %>%
  filter(content_type %in% "table cell")
print(table_cells)
## # A tibble: 52 x 11
## doc_index content_type style_name text level num_id row_id
## <int> <chr> <chr> <chr> <dbl> <int> <int>
## 1 16 table cell Light Shading Petals NA NA 1
## 2 16 table cell Light Shading 5,621498349 NA NA 2
## 3 16 table cell Light Shading 4,994616997 NA NA 3
## 4 16 table cell Light Shading 4,767504884 NA NA 4
## 5 16 table cell Light Shading 25,9242382 NA NA 5
## 6 16 table cell Light Shading 6,489375001 NA NA 6
## 7 16 table cell Light Shading 5,7858682 NA NA 7
## 8 16 table cell Light Shading 5,645575295 NA NA 8
## 9 16 table cell Light Shading 4,828953215 NA NA 9
## 10 16 table cell Light Shading 6,783500773 NA NA 10
## # ... with 42 more rows, and 4 more variables: is_header <lgl>,
## # cell_id <dbl>, col_span <dbl>, row_span <dbl>
Cell positions and values are stored in the columns `row_id`, `cell_id`, `text` and `is_header` (a logical column indicating whether the cell is part of the header). Note that the content (column `text`) is a character vector.
# Drop the header cells; keep each remaining cell's position and text.
table_body <- table_cells %>%
  filter(!is_header) %>%
  select(row_id, cell_id, text)
table_body
## # A tibble: 48 x 3
## row_id cell_id text
## <int> <dbl> <chr>
## 1 2 1 5,621498349
## 2 3 1 4,994616997
## 3 4 1 4,767504884
## 4 5 1 25,9242382
## 5 6 1 6,489375001
## 6 7 1 5,7858682
## 7 8 1 5,645575295
## 8 9 1 4,828953215
## 9 10 1 6,783500773
## 10 11 1 5,395076839
## # ... with 38 more rows
The data can be reshaped using the columns `row_id`, `cell_id` and `text`; this is easy to do with `tidyr`:
# Reshape only when tidyr can be loaded.
if (require("tidyr")) {
  # One row per row_id, one column per cell_id, filled with the cell text.
  table_body %>%
    spread(cell_id, text)
}
## Loading required package: tidyr
## # A tibble: 12 x 5
## row_id `1` `2` `3`
## * <int> <chr> <chr> <chr>
## 1 2 5,621498349 <NA> 2,46210657918,2034091
## 2 3 4,994616997 AA 2,429320759
## 3 4 4,767504884 <NA> AAA
## 4 5 25,9242382 <NA> 2,066051345
## 5 6 6,489375001 25,21130805 2,901582763
## 6 7 5,7858682 25,52433147 2,655642742
## 7 8 5,645575295 Merged cell 2,278691288
## 8 9 4,828953215 <NA> 2,238467716
## 9 10 6,783500773 <NA> 2,202762147
## 10 11 5,395076839 <NA> 2,538375992
## 11 12 4,683617783 29,2459239 2,601945544
## 12 13 Note <NA> <NA>
## # ... with 1 more variables: `4` <chr>
Getting headers requires another operation:
# Reshape only when tidyr can be loaded.
if (require("tidyr")) {
  # Same reshaping as for the body, applied to the header cells only.
  table_cells %>%
    filter(is_header) %>%
    select(row_id, cell_id, text) %>%
    spread(cell_id, text)
}
## # A tibble: 1 x 5
## row_id `1` `2` `3` `4`
## * <int> <chr> <chr> <chr> <chr>
## 1 1 Petals Internode Sepal Bract
The function `pptx_summary` returns the content of a PowerPoint document.
# Locate the example PowerPoint document bundled with the officer package.
example_pptx <- system.file(
  "doc_examples", "example.pptx",
  package = "officer"
)
# Read the presentation, then summarise its content as a data frame.
doc <- read_pptx(example_pptx)
content <- pptx_summary(doc)
# Show the summary.
content
## # A tibble: 55 x 9
## text id content_type slide_id row_id cell_id col_span
## <chr> <chr> <chr> <int> <int> <int> <dbl>
## 1 Title 12 paragraph 1 NA NA NA
## 2 A table 13 paragraph 1 NA NA NA
## 3 and some text 13 paragraph 1 NA NA NA
## 4 and some list (1) 13 paragraph 1 NA NA NA
## 5 and some list (2) 13 paragraph 1 NA NA NA
## 6 Header 1 18 table cell 1 1 1 1
## 7 Header 2 18 table cell 1 1 2 1
## 8 Header 3 18 table cell 1 1 3 1
## 9 A 18 table cell 1 2 1 1
## 10 12.23 18 table cell 1 2 2 1
## # ... with 45 more rows, and 2 more variables: row_span <dbl>,
## # media_file <chr>
Explore the results:
# Count the distinct ids for each content type.
content %>%
  group_by(content_type) %>%
  summarise(n = n_distinct(id))
## # A tibble: 3 x 2
## content_type n
## <chr> <int>
## 1 image 1
## 2 paragraph 5
## 3 table cell 2
To get all paragraphs:
# Keep the paragraph rows, with their id and text.
par_data <- content %>%
  filter(content_type %in% "paragraph") %>%
  select(id, text)
par_data
## # A tibble: 13 x 2
## id text
## <chr> <chr>
## 1 12 Title
## 2 13 A table
## 3 13 and some text
## 4 13 and some list (1)
## 5 13 and some list (2)
## 6 15 R logo
## 7 2 Hi
## 8 3 This is
## 9 3 an unordered
## 10 3 list of paragraphs
## 11 3
## 12 3 This is an ordered
## 13 3 list of paragraphs
To get an image:
# Select the summary row(s) whose content type is "image".
image_row <- content %>%
  filter(content_type %in% "image")
# Extract the media file referenced by that row to "extract.png".
media_extract(
  doc,
  path = image_row[["media_file"]],
  target = "extract.png"
)
## [1] TRUE
Tables are unstacked:
# Keep only the table-cell rows of the summary.
table_cells <- content %>%
  filter(content_type %in% "table cell")
table_cells
## # A tibble: 41 x 9
## text id content_type slide_id row_id cell_id col_span
## <chr> <chr> <chr> <int> <int> <int> <dbl>
## 1 Header 1 18 table cell 1 1 1 1
## 2 Header 2 18 table cell 1 1 2 1
## 3 Header 3 18 table cell 1 1 3 1
## 4 A 18 table cell 1 2 1 1
## 5 12.23 18 table cell 1 2 2 1
## 6 blah blah 18 table cell 1 2 3 1
## 7 B 18 table cell 1 3 1 1
## 8 1.23 18 table cell 1 3 2 1
## 9 blah blah blah 18 table cell 1 3 3 1
## 10 B 18 table cell 1 4 1 1
## # ... with 31 more rows, and 2 more variables: row_span <dbl>,
## # media_file <chr>
Cell positions and values are stored in the columns `row_id`, `cell_id` and `text`. Note that here there is no indicator for table headers.
# Reshape only when tidyr can be loaded.
if (require("tidyr")) {
  table_cells %>%
    # id is a character column, so compare against the string "18".
    filter(id == "18") %>%
    select(row_id, cell_id, text) %>%
    # One row per row_id, one column per cell_id, filled with the cell text.
    spread(cell_id, text)
}
## # A tibble: 5 x 4
## row_id `1` `2` `3`
## * <int> <chr> <chr> <chr>
## 1 1 Header 1 Header 2 Header 3
## 2 2 A 12.23 blah blah
## 3 3 B 1.23 blah blah blah
## 4 4 B 9.0 Salut
## 5 5 C 6 Hello