Text Mining of Permanent Downhole Gauges
Vignette Author
2018-09-22
library(petro.One)
library(tm)
## Loading required package: NLP
# Build the search URL for the phrase of interest. `how = "all"` is passed
# through as-is (presumably "match all words" -- confirm in the petro.One docs).
my_url <- make_search_url(
  query = "Permanent Downhole Gauge",
  how = "all"
)
get_papers_count(my_url) # how many papers total
## [1] 575
papers_by_type(my_url) # papers by type
## # A tibble: 3 x 2
## name value
## <chr> <dbl>
## 1 Conference paper 487
## 2 Journal paper 84
## 3 Presentation 4
# Read the papers matched by the search URL into a data frame. The printed
# result below has one row per paper (575 rows, matching the paper count)
# and six columns: book_title, paper_id, dc_type, authors, year, source.
df <- read_multidoc(my_url)
# Bare name: relies on top-level auto-printing to show the tibble preview.
df
## # A tibble: 575 x 6
## book_title paper_id dc_type authors year source
## <fct> <fct> <fct> <chr> <int> <fct>
## 1 Permanent Downhole G~ SPE-1226~ confere~ Horng, Chen Jiun~ 2009 SPE
## 2 Reservoir Management~ SPE-9097~ confere~ de Oliveira Silv~ 2004 SPE
## 3 Wavelet Filtering of~ SPE-1230~ confere~ Pico, Carlos, | ~ 2009 SPE
## 4 Pressure Transient A~ SPE-1637~ confere~ Al-hashim, Hasan~ 2013 SPE
## 5 Encouraging Experien~ SPE-1283~ confere~ Igbokoyi, A.O., ~ 2009 SPE
## 6 Comparative Analysis~ SPE-1724~ confere~ Enyekwe, A.E., U~ 2014 SPE
## 7 Interpreting Pressur~ SPE-1472~ confere~ Liu, Yang, Stanf~ 2011 SPE
## 8 Pressure Transient A~ SPE-1891~ confere~ Zubarev, Denis, ~ 2017 SPE
## 9 Analyzing Transient ~ SPE-1075~ confere~ Zheng, Shiyi, He~ 2007 SPE
## 10 Analyzing Simultaneo~ SPE-1100~ confere~ Rai, Himansu, Ch~ 2007 SPE
## # ... with 565 more rows
library(petro.One)
# Tabulate how often each term occurs in `df`; the printed result has one row
# per term with columns `word` and `freq`, most frequent first.
# NOTE(review): presumably counted over the paper titles -- confirm against
# the petro.One documentation for term_frequency().
term_freq <- term_frequency(df)
# Bare name: relies on top-level auto-printing to show the tibble preview.
term_freq
## # A tibble: 1,556 x 2
## word freq
## <chr> <int>
## 1 reservoir 134
## 2 well 122
## 3 data 101
## 4 field 89
## 5 pressure 88
## 6 production 85
## 7 downhole 84
## 8 gas 74
## 9 permanent 71
## 10 analysis 68
## # ... with 1,546 more rows
library(petro.One)
# Word cloud of the terms found in `df`.
# NOTE(review): presumably at most 100 words, each occurring at least 15
# times -- confirm the argument semantics in the petro.One docs.
plot_wordcloud(
  df,
  max.words = 100,
  min.freq = 15
)

Bar plot
# Bar plot of term frequencies for `df`.
# NOTE(review): presumably only terms occurring at least 25 times are
# drawn -- confirm the meaning of `min.freq` in the petro.One docs.
plot_bars(
  df,
  min.freq = 25
)

Dendrogram
# Plot how the terms in `df` relate to one another.
# NOTE(review): `min.freq` and `threshold` are presumably a minimum term
# frequency and a minimum association strength -- confirm in the
# petro.One docs.
plot_relationships(
  df,
  min.freq = 25,
  threshold = 0.1
)

library(cluster)
# Hierarchical clustering of the terms found in the papers.

# Take the `tdm` element of the list returned by get_term_document_matrix().
tdm <- get_term_document_matrix(df)$tdm

# Drop the sparsest terms (maximal allowed sparsity 0.93) so that only a
# small set of terms is left to cluster (17 objects in the output below).
tdm.rst <- removeSparseTerms(tdm, 0.93)

# Pairwise Euclidean distances between the rows of the reduced matrix
# (the terms, for a term-document matrix). The method is spelled
# "euclidean", the name documented by stats::dist(); the previous spelling
# "euclidian" only worked through a compatibility shim.
d <- dist(tdm.rst, method = "euclidean")

# Complete-linkage agglomerative clustering.
# For a different look try substituting: method = "ward.D"
fit <- hclust(d = d, method = "complete")
fit
##
## Call:
## hclust(d = d, method = "complete")
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 17
