Text Mining of Papers on Permanent Downhole Gauges

Vignette Author

2018-09-22

library(petro.One)
library(tm)
## Loading required package: NLP
# Build the search URL for the query. how = "all" presumably requires every
# word of the query to match -- confirm against the petro.One documentation.
my_url <- make_search_url(query = "Permanent Downhole Gauge", how = "all")

# how many papers total
get_papers_count(my_url)
## [1] 575
# papers by type
papers_by_type(my_url)
## # A tibble: 3 x 2
##   name             value
##   <chr>            <dbl>
## 1 Conference paper   487
## 2 Journal paper       84
## 3 Presentation         4
# create a dataframe of papers found
df <- read_multidoc(my_url)
df
## # A tibble: 575 x 6
##    book_title            paper_id  dc_type  authors            year source
##    <fct>                 <fct>     <fct>    <chr>             <int> <fct> 
##  1 Permanent Downhole G~ SPE-1226~ confere~ Horng, Chen Jiun~  2009 SPE   
##  2 Reservoir Management~ SPE-9097~ confere~ de Oliveira Silv~  2004 SPE   
##  3 Wavelet Filtering of~ SPE-1230~ confere~ Pico, Carlos, | ~  2009 SPE   
##  4 Pressure Transient A~ SPE-1637~ confere~ Al-hashim, Hasan~  2013 SPE   
##  5 Encouraging Experien~ SPE-1283~ confere~ Igbokoyi, A.O., ~  2009 SPE   
##  6 Comparative Analysis~ SPE-1724~ confere~ Enyekwe, A.E., U~  2014 SPE   
##  7 Interpreting Pressur~ SPE-1472~ confere~ Liu, Yang, Stanf~  2011 SPE   
##  8 Pressure Transient A~ SPE-1891~ confere~ Zubarev, Denis, ~  2017 SPE   
##  9 Analyzing Transient ~ SPE-1075~ confere~ Zheng, Shiyi, He~  2007 SPE   
## 10 Analyzing Simultaneo~ SPE-1100~ confere~ Rai, Himansu, Ch~  2007 SPE   
## # ... with 565 more rows
library(petro.One)

# Tabulate term frequencies for the retrieved papers and print the result;
# the echoed output shows one row per term with columns `word` and `freq`.
term_freq <- term_frequency(df)
term_freq
## # A tibble: 1,556 x 2
##    word        freq
##    <chr>      <int>
##  1 reservoir    134
##  2 well         122
##  3 data         101
##  4 field         89
##  5 pressure      88
##  6 production    85
##  7 downhole      84
##  8 gas           74
##  9 permanent     71
## 10 analysis      68
## # ... with 1,546 more rows
library(petro.One)

# Draw a word cloud from the papers in `df`.
# NOTE(review): max.words / min.freq presumably cap the number of words shown
# and set the minimum term frequency -- confirm in the petro.One docs.
plot_wordcloud(df, max.words = 100, min.freq = 15)

Bar plot

# Bar plot built from the papers in `df`.
# NOTE(review): min.freq = 25 presumably keeps only terms occurring at least
# 25 times -- confirm in the petro.One docs.
plot_bars(df, min.freq = 25)

Dendrogram

# Plot relationships between terms found in `df`.
# NOTE(review): presumably only terms with frequency >= 25 are used and
# `threshold` is the minimum association strength for drawing a link --
# confirm both in the petro.One docs.
plot_relationships(df, min.freq = 25, threshold = 0.1)

library(cluster)   
# Hierarchical clustering of the terms, drawn as a dendrogram.

# Term-document matrix for the retrieved papers (the `tdm` element of the
# list returned by get_term_document_matrix()).
tdm <- get_term_document_matrix(df)$tdm

# Drop sparse terms (maximal allowed sparsity 0.93) so that only a small set
# of terms is left to cluster; the echoed output below reports 17 objects.
tdm.rst <- removeSparseTerms(tdm, sparse = 0.93)

# Pairwise distances between the rows of the reduced matrix. Use the
# documented method name "euclidean"; the former spelling "euclidian" only
# worked because stats::dist() special-cases it.
d <- dist(tdm.rst, method = "euclidean")

# Complete-linkage clustering.
# For a different look try substituting: method = "ward.D"
fit <- hclust(d = d, method = "complete")
fit
## 
## Call:
## hclust(d = d, method = "complete")
## 
## Cluster method   : complete 
## Distance         : euclidean 
## Number of objects: 17

# NOTE(review): `hang` is the fraction of the plot height by which labels hang
# below the rest of the plot; a negative value (commonly hang = -1) makes the
# labels hang down from 0 instead. Confirm that hang = 1 is intended.
plot(fit, hang = 1)