The maximum number of rows that a OnePetro query can return is 1000. This means that the user can set up a query to return up to a maximum of 1000 papers. Above that number, the query to OnePetro will return an error.
OnePetro has options to define the number of rows to display at 10, 50 and 100 rows. Additionally, through scripts like these, that number could be raised up to 1,000.
This article describes the process of reading multiple pages containing thousands of papers into a single data frame.
Retrieve the most numerous paper type
library(petro.One)

# Search URL for the query string, passed with how = "all".
my_url <- make_search_url(
  query = "pressure transient analysis",
  how   = "all"
)

# Number of papers reported for this search.
get_papers_count(my_url)
## [1] 4270
# Break the search results down by document type (one row per type).
papers_by_type(my_url)
## # A tibble: 7 x 2
## name value
## <chr> <dbl>
## 1 Chapter 1
## 2 Conference paper 3260
## 3 General 61
## 4 Journal paper 935
## 5 Media 5
## 6 Other 1
## 7 Presentation 7
For the time being we will retrieve only conference papers.
# Restrict the search to "conference-paper" only, because other document
# types have a different data frame structure.
my_url_1 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 0,
  rows    = 1000
)

# Paper count reported for this URL.
get_papers_count(my_url_1)
## [1] 4270
# Page 1: read the results for my_url_1, save them to an HTML file, and
# convert that file to a data frame.
htm_1  <- "pta-01-conference.html"
page_1 <- read_onepetro(my_url_1)
xml2::write_html(page_1, file = htm_1)
onepetro_page_to_dataframe(htm_1)
## # A tibble: 1,000 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 Pressure Transient ~ PETSOC-2~ confere~ Rabb, J., Petro-C~ 2003 PETSOC
## 2 Well-head Pressure ~ SPE-1648~ confere~ Spyrou, Charidimo~ 2013 SPE
## 3 Pressure Transient ~ SPE-2967~ confere~ Sahni, A., Univer~ 1995 SPE
## 4 Numerical Solutions~ SPE-2617~ confere~ Warren, G.M., SIM~ 1993 SPE
## 5 Pressure Transient ~ SPE-2838~ confere~ Larsen, Leif, Sta~ 1994 SPE
## 6 Integrating Pressur~ SPE-1063~ confere~ Rahim, Zillur, Re~ 2006 SPE
## 7 Automated Pressure ~ SPE-1443~ confere~ Rees, Hugh Richar~ 2011 SPE
## 8 How Wellbore Dynami~ PETSOC-9~ confere~ Mattar, L., Feket~ 1991 PETSOC
## 9 Pressure-Transient ~ SPE-4913~ confere~ Yildiz, Turhan, S~ 1998 SPE
## 10 Software Showcase: ~ SPE-2446~ confere~ Baldwin, J.O., Co~ 1992 SPE
## # ... with 990 more rows
# Page 2: same search, starting at offset 1000.
my_url_2 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 1000,
  rows    = 1000
)

# Read the results, save them to an HTML file, and convert that file to a
# data frame.
htm_2  <- "pta-02-conference.html"
page_2 <- read_onepetro(my_url_2)
xml2::write_html(page_2, file = htm_2)
onepetro_page_to_dataframe(htm_2)
## # A tibble: 1,000 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 Multi-Zone Waterflo~ SPE-18198~ confere~ Petrik, Artyom, ~ 2016 SPE
## 2 Diagnosis And Chara~ SPE-17499~ confere~ Anisur Rahman, N~ 2015 SPE
## 3 New Approach Using ~ SPE-18856~ confere~ Uematsu, H., Zak~ 2017 SPE
## 4 Enhancing Smart Com~ SPE-18324~ confere~ Hussain, Asim, A~ 2016 SPE
## 5 Contribution of Tid~ SPE-18883~ confere~ Faidouzi, Mohame~ 2017 SPE
## 6 Estimation of Reser~ SPE-18776~ confere~ Bobreneva, Yu. O~ 2017 SPE
## 7 Estimation of Reser~ SPE-18776~ confere~ Bobreneva, Yu. O~ 2017 SPE
## 8 A New Approach for ~ SPE-17079~ confere~ Pelling, Ross, B~ 2014 SPE
## 9 Automated Field Dev~ URTEC-216~ confere~ Tilke, Peter, Sc~ 2015 URTEC
## 10 An Integrated Appro~ SPE-18747~ confere~ Lati, Shrutesh, ~ 2017 SPE
## # ... with 990 more rows
# Page 3: same search, starting at offset 2000.
my_url_3 <- make_search_url(
  query   = "pressure transient analysis",
  how     = "all",
  dc_type = "conference-paper",
  start   = 2000,
  rows    = 1000
)

# Read the results, save them to an HTML file, and convert that file to a
# data frame.
htm_3  <- "pta-03-conference.html"
page_3 <- read_onepetro(my_url_3)
xml2::write_html(page_3, file = htm_3)
onepetro_page_to_dataframe(htm_3)
## # A tibble: 1,000 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 The Effect of Long-t~ SPE-3049~ confere~ Bilden, D.M., BJ~ 1995 SPE
## 2 A Step Change in Dee~ SPE-1679~ confere~ Maizeret, Pierre~ 2014 SPE
## 3 Mechanisms and Main ~ SPE-9028~ confere~ Rodriguez, Ferna~ 2004 SPE
## 4 Success in Offshore ~ SPE-1621~ confere~ Stracke, M.L., A~ 1987 SPE
## 5 Decline-Curve Analys~ SPE-2293~ confere~ Aguilera, R., Se~ 1991 SPE
## 6 A New Fracturing Des~ SPE-1427~ confere~ Uetani, Takaaki,~ 2011 SPE
## 7 Application of Horiz~ IPTC-133~ confere~ Diyashev, Iskand~ 2009 IPTC
## 8 Breathing New Life I~ SPE-2553~ confere~ Al Zarafi, Ahmed~ 1993 SPE
## 9 Automatic Optimizati~ SPE-3707~ confere~ Buitrago, S., In~ 1996 SPE
## 10 Pore-Type Determinat~ SPE-1368~ confere~ Soto Becerra, Ro~ 2010 SPE
## # ... with 990 more rows
# Page 4: same search, starting at offset 3000 and requesting 100 rows.
# NOTE(review): papers_by_type() reported 3260 conference papers, so with
# rows = 100 the papers past offset 3100 appear to be left unread --
# confirm whether this smaller page size is intentional.
my_url_4 <- make_search_url(query = "pressure transient analysis",
how = "all",
dc_type = "conference-paper",
start = 3000,
rows = 100)
# Read the results, save them to an HTML file, and convert that file to a
# data frame.
page_4 <- read_onepetro(my_url_4)
htm_4 <- "pta-04-conference.html"
xml2::write_html(page_4, file = htm_4)
onepetro_page_to_dataframe(htm_4)
## # A tibble: 100 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 Horizontal Well Eval~ SPE-2354~ confere~ Oosten, R.K.V., ~ 1991 SPE
## 2 Estimating Pore Pres~ SPE-5660~ confere~ Craig, David P.,~ 1999 SPE
## 3 Advantages In Joint-~ SPWLA-20~ confere~ Angeles, Renzo, ~ 2008 SPWLA
## 4 Analysis of Interfer~ SPE-8429~ confere~ Al-Khamis, M., C~ 2003 SPE
## 5 Advances in Geomecha~ ARMA-201~ confere~ Peng, Yan, China~ 2017 ARMA
## 6 Lessons Learned from~ SPE-1598~ confere~ Camilleri, Lawre~ 2012 SPE
## 7 Drilling Optimizatio~ SPE-1425~ confere~ Holdaway, Keith ~ 2011 SPE
## 8 Unsteady Flow to a W~ SPE-9902~ confere~ Raghavan, Rajago~ 1981 SPE
## 9 Formation Damage Ind~ SPE-1194~ confere~ Al-Anazi, Hamoud~ 2009 SPE
## 10 A Unified Mathematic~ SPE-1428~ confere~ Wu, Yu-Shu, Colo~ 2011 SPE
## # ... with 90 more rows
# Convert each saved HTML page to a data frame.
p1 <- onepetro_page_to_dataframe(htm_1)
p2 <- onepetro_page_to_dataframe(htm_2)
p3 <- onepetro_page_to_dataframe(htm_3)
p4 <- onepetro_page_to_dataframe(htm_4)

# Stack the four pages, in order, into a single data frame and print it.
papers <- do.call(rbind, list(p1, p2, p3, p4))
papers
## # A tibble: 3,100 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 Pressure Transient ~ PETSOC-2~ confere~ Rabb, J., Petro-C~ 2003 PETSOC
## 2 Well-head Pressure ~ SPE-1648~ confere~ Spyrou, Charidimo~ 2013 SPE
## 3 Pressure Transient ~ SPE-2967~ confere~ Sahni, A., Univer~ 1995 SPE
## 4 Numerical Solutions~ SPE-2617~ confere~ Warren, G.M., SIM~ 1993 SPE
## 5 Pressure Transient ~ SPE-2838~ confere~ Larsen, Leif, Sta~ 1994 SPE
## 6 Integrating Pressur~ SPE-1063~ confere~ Rahim, Zillur, Re~ 2006 SPE
## 7 Automated Pressure ~ SPE-1443~ confere~ Rees, Hugh Richar~ 2011 SPE
## 8 How Wellbore Dynami~ PETSOC-9~ confere~ Mattar, L., Feket~ 1991 PETSOC
## 9 Pressure-Transient ~ SPE-4913~ confere~ Yildiz, Turhan, S~ 1998 SPE
## 10 Software Showcase: ~ SPE-2446~ confere~ Baldwin, J.O., Co~ 1992 SPE
## # ... with 3,090 more rows
# Keep only the papers whose title contains the search phrase
# (case-insensitive). The title column of `papers` is `book_title`;
# the column `title_data` does not exist in this data frame, so matching
# against it raised a warning and selected no rows.
pattern <- "pressure transient analysis"
rows <- grep(pattern = pattern, papers$book_title, ignore.case = TRUE)
papers[rows, ]
## (output omitted: re-render the article to regenerate the matching rows)
# Clean up: delete the four HTML files written by xml2::write_html() above.
# file.remove() returns one logical per file.
files <- c(htm_1, htm_2, htm_3, htm_4)
file.remove(files)
## [1] TRUE TRUE TRUE TRUE