The hardware and bandwidth for this mirror is donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.

Examples with wiki_utils

Angel Zazo, Department of Computer Science and Automatics, University of Salamanca

2024-04-13

Functions

Functions to obtain a list of Wikidata entities

w_SearchByLabel(string, langsorder='en', lang="", instanceof="", Pproperty="", mode=c("exact","startswith","inlabel"))

w_OccupationEntities(Qoc, nlimit=NULL, mode=c(‘default’,‘count’,‘wikipedias’))

Functions to obtain information from a list of Wikidata entities or from a single one.

w_isInstanceOf(entity_list, instanceof)

w_Wikipedias(entity_list, wikilangs="", instanceof="", nlimit=1500)

w_isValid(entity_list, nlimit=50000)

w_Property(entity_list, Pproperty, langsorder=‘en’, nlimit=10000)

w_IdentifiersOfAuthority(Pauthority, langsorder='en', instanceof="")

Pauthority = Authority Database Property in Wikidata

w_EntityInfo(entity, langsorder='en', wikilangs="", mode=c('default','tiny','film'))

Functions to obtain information using the Wikimedia APIs

m_Opensearch(string, project=‘en.wikipedia.org’, profile=“engine_autoselect”, redirects=“resolve”)

m_reqMediaWiki(titles, mode=c(‘wikidataEntity’,‘redirects’,‘pagePrimaryImage’,‘pageFiles’), project=‘en.wikipedia.org’, redirects=TRUE, exclude_ext=‘svg|webp|xcf’)

m_Pageviews(article, start, end, project=“en.wikipedia.org”, access=“all-access”, agent=“user”, granularity=“monthly”, redirects=FALSE)

m_XtoolsInfo(article, infotype=“articleinfo”, project=“en.wikipedia.org”, redirects=FALSE)

Functions to obtain information (viafID or cluster records) using the VIAF API

v_AutoSuggest(author) : obtains viafID

v_Search(CQL_Query, mode=c(‘default’, ‘anyField’, ‘allmainHeadingEl’, ‘allNames’, ‘allPersonalNames’, ‘allTitle’)) : obtains clusters records

Function to retrieve a cluster record using the viafID.

v_GetRecord(viafid, record_format=‘viaf.json’): retrieve a cluster record

Function to extract information from a VIAF cluster record

v_Extract(viaf, info, source=NULL)

Package installation and loading

To install and load the updated version of the wikiTools package simply run the following commands:

install.packages("wikiTools")
library(wikiTools)

Examples of Wikidata functions using WDQS

Search string “Iranzo” in different positions

Exact search in Label or exact search in AltLabel (case-sensitive and diacritic-sensitive)

Optional: limit by instanceof Wikidata class (Qxx).

Optional: return information of some properties (Pproperties, Pxxx).

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570')

Search at the beginning in Label or AltLabel (diacritics and case are ignored)

df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', instanceof = 'Q5',
                      mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en',
                      instanceof = 'Q5|Q101352', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='startswith')

Search in any position in Label or AltLabel (diacritics and case are ignored)

If lang == '', the search is performed in any language; otherwise it is performed only in the indicated language.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')

Search only in Chinese (Simplified) (language code: zh):

df <- w_SearchByLabel(string='Iranzo', langsorder='zh|es', lang='zh', mode='inlabel')

Optional instanceof and Property

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='inlabel')

aux: getting a vector of entities (l) to use later.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')
l <- df$entity

w_isInstanceOf

Check whether the elements in entity_list are instances of a Wikidata class

df <- w_isInstanceOf(entity_list=l, instanceof='Q5')
# Not TRUE
df[!df$instanceof_Q5,]
##                entity instanceof_Q5
## Q45987474   Q45987474         FALSE
## Q85684513   Q85684513         FALSE
## Q117783790 Q117783790         FALSE
## Q47034606   Q47034606         FALSE
## Q45976259   Q45976259         FALSE
## Q11912738   Q11912738         FALSE
## Q97101009   Q97101009         FALSE
## Q111015546 Q111015546         FALSE
## Q97101007   Q97101007         FALSE
## Q31835108   Q31835108         FALSE
## Q6058550     Q6058550         FALSE

w_Wikipedias

Search for Wikipedia pages in all/some languages

Optional: instanceOF (limit to entities which are instance of a Wikidata class)

df <- w_Wikipedias(entity_list=l)
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr')
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr', instanceof="Q5")

w_OccupationEntities

Count entities, or get the entities with that occupation, also get Wikipedia pages

Note: depending on the connection speed, the nlimit parameter must be adjusted

w_OccupationEntities(Qoc='Q2306091', mode='count') # Qoc for Sociologist
## [1] 19308
l  <- w_OccupationEntities(Qoc='Q2306091') # l=entities: vector
lw <- w_OccupationEntities(Qoc='Q2306091', mode='wikipedias') # lw=dataframe
 # We can obtain the same information using previous function w_Wikipedias:
 lw2 <- w_Wikipedias(entity_list=l, wikilangs='')
 # Verifying:
 all(lw['Q10320558','pages'] == lw2['Q10320558','pages'])
 # Verifying:
 all(sort(strsplit(lw['Q9061', 'pages'], '|', fixed = T)[[1]]) ==
     sort(strsplit(lw2['Q9061', 'pages'], '|', fixed = T)[[1]]))

w_isValid

Check whether the Wikidata entities are valid. An entity is valid if it has a label or a description. If an entity exists but is not valid, it may be a redirect to another entity; in that case, the redirection target is obtained. Other entities may have existed in the past but have since been deleted.

l2 <- append(l, c("Q115637688", "Q105660123"))  # Note: adding two new entities
v <- w_isValid(l2)
# Not valid
v[!v$valid,]
##                entity valid redirection
## Q115637688 Q115637688 FALSE            
## Q105660123 Q105660123 FALSE   Q97352588

w_Property

Obtain properties of entity_list.

p <- w_Property(l, Pproperty = 'P21|P569|P214', langsorder = 'es|en')

w_IdentifiersOfAuthority

Search for Wikidata entities that have an identifier in the Wikidata authority property “Pauthority”.

Optional: instanceOf

Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de Arte Reina Sofía)

mncars   <- w_IdentifiersOfAuthority(Pauthority="P4439", langsorder = 'es|en')
# 1286  [human, groups, etc.]
mncarsQ5 <- w_IdentifiersOfAuthority(Pauthority="P4439", langsorder = 'es|en',
                                     instanceof = 'Q5')  # 1280
# Entities that are not 'human' (Q5) (see the entityDescription column):
mncars[!(mncars$entity %in% mncarsQ5$entity),]  # not instance of Q5.
##                entity                                  entityLabel
## Q105687869 Q105687869            João Maria Gusmão and Pedro Paiva
## Q4517304     Q4517304                                  Chto Delat?
## Q5849776     Q5849776                                 Estrujenbank
## Q20102460   Q20102460                        Agustín Parejo School
## Q27657364   Q27657364 Midnight Gardening (Jardinería a medianoche)
## Q317874       Q317874                                     Ant Farm
##                                   entityDescription                 P4439
## Q105687869                  Portuguese artistic duo     gusmao-joao-maria
## Q4517304                     Russian art collective            chto-delat
## Q5849776                        colectivo artístico          estrujenbank
## Q20102460                                           agustin-parejo-school
## Q27657364                 cuadro de Jerónimo Elespe       elespe-jeronimo
## Q317874    American art and architecture collective              ant-farm

w_EntityInfo

Get some properties of a Wikidata entity.

df1 <- w_EntityInfo(entity='Q134644', langsorder = 'es|en')
# Also a "tiny" version
df2 <- w_EntityInfo(entity='Q134644', langsorder = 'es|en', mode='tiny')
# Differences: fields non existing in the tiny row set as "--":
Aleixandre <- rbind(
  df1,
  data.frame(c(df2, sapply(setdiff(names(df1), names(df2)), function(x) "--")),
             row.names = 'tiny')
)
BenHur    <- w_EntityInfo(entity='Q180098', langsorder='es|en',
                          wikilangs = 'es|fr', mode='film')
Nosferatu <- w_EntityInfo(entity='Q151895', langsorder='es|en',
                          wikilangs = 'es|fr|en', mode='film')
# Nosferatu has a public video:
Nosferatu$video
## [1] "http://commons.wikimedia.org/wiki/Special:FilePath/Nosferatu%20%281922%2C%20English%20titles%201947%29.webm"
# Combining data-frames:
films <- rbind(BenHur, Nosferatu)

Examples of WikiMedia functions

m_Opensearch

Search for articles that contain any of the words (note: it is better to use a long string)

Some search profiles:

df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org',
                   profile="engine_autoselect", redirects="resolve")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="strict")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="fuzzy")

m_reqMediaWiki

Checks if titles are in a Wikimedia project and returns the Wikidata entity for them, if they have one.

Note that URLdecode(“a%CC%8C”) is the letter “a” with the combining caron (ǎ)

df <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                        mode='wikidataEntity', project='en.wikipedia.org')

Obtains the redirections of a page (the page itself can be a redirect to another page).

Returns a vector for each title; in each vector the first element is the destination, and the rest are all the pages that redirect to it.

a <- m_reqMediaWiki(c('Cervantes', 'Planck', 'Noexiste'), mode='redirects',
                    project='es.wikipedia.org')
a
## $Cervantes
##  [1] "Miguel de Cervantes"            "Miguel de Cerbantes"           
##  [3] "Miguel de Cervantes y Saavedra" "Miguel De Cervantes y Saavedra"
##  [5] "El manco de Lepanto"            "Miguel de cervantes"           
##  [7] "Manco de Lepanto"               "Don Miguel de Cervantes"       
##  [9] "Cervantino"                     "Cervantina"                    
## [11] "Miguel de Cervantes Saavedra"   "Cervantes Saavedra, Miguel de" 
## [13] "Miguel de Cervantes y Cortinas" "Cervantesco"                   
## [15] "Cervántico"                     "Cervantes"                     
## 
## $Planck
## [1] "Max Planck"                   "Planck"                      
## [3] "Max Karl Ernst Ludwig Planck"
## 
## $Noexiste
## [1] NA

Gets the URL of the primary image of Wikimedia pages.

Gets the URLs of all files inserted in the pages (images, sounds, videos…), using ‘|’ as the separator and excluding the extensions given in the exclude_ext parameter.

Both modes automatically resolve redirects (the destination is the “normalized” column of the returned data-frame).

i <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pagePrimaryImage')

f <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pageFiles', exclude_ext = "svg|webp|xcf")

m_Pageviews

Gets the number of visits that a page has had in a date interval

Optional: redirects

v <-  m_Pageviews(article="Cervantes", start="20230101", end="20230501",
                   project="es.wikipedia.org", granularity="monthly")
vv <- m_Pageviews(article="Cervantes", start="20230101", end="20230501",
                   project="es.wikipedia.org", granularity="monthly",
                   redirects=TRUE)

m_XtoolsInfo

Obtains information (as vector) about an article in the Wikimedia project.

Infotype: articleinfo, prose, links

Optional: redirects

x <-  m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org")
xx <- m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org",
                   redirects=TRUE)

y <-  m_XtoolsInfo(article="Miguel de Cervantes", infotype="links", project="es.wikipedia.org")
yy <- m_XtoolsInfo(article="Cervantes", infotype="links", project="es.wikipedia.org",
                    redirects=TRUE)

Gets all information (articleinfo, prose, links).

z  <- m_XtoolsInfo(article="Miguel de Cervantes", infotype="all", project="es.wikipedia.org")
zz <- m_XtoolsInfo(article="Cervantes", infotype="all", project="es.wikipedia.org",
                       redirects=TRUE)

Examples using VIAF functions

v_AutoSuggest

Searches for authors. Sometimes the same author appears several times, under different names.

Returns a data-frame.

Important: The API returns a maximum of 10 records.

v_AutoSuggest('Iranzo')
##       term                                                  score  nametype  
##  [1,] "Iranzo, Antonio, 1930-2003"                          "1563" "personal"
##  [2,] "Iranzo, Carmen"                                      "1439" "personal"
##  [3,] "Iranzo, Miguel Lucas de"                             "1392" "personal"
##  [4,] "Iranzo, G., 1918-1998"                               "1365" "personal"
##  [5,] "Iranzo Muñío, María Teresa, 19..-"                   "1346" "personal"
##  [6,] "Iranzo Simón, Víctor 1850-1890"                      "1220" "personal"
##  [7,] "Iranzo Benedito, Manuel, 1867-1921"                  "1202" "personal"
##  [8,] "Iranzo, Olga"                                        "1190" "personal"
##  [9,] "Iranzo Bielsa, José, el Pastor de Andorra 1915-2016" "1170" "personal"
## [10,] "Iranzo Martín, Juan Emilio 1956-"                    "1162" "personal"
##       viafid                
##  [1,] "87262213"            
##  [2,] "46775630"            
##  [3,] "3268989"             
##  [4,] "88012748"            
##  [5,] "48297869"            
##  [6,] "87244676"            
##  [7,] "87100730"            
##  [8,] "49150565569906250223"
##  [9,] "63243927"            
## [10,] "58295559"
v_AutoSuggest('Esparza, María')
##       term                                                                    
##  [1,] "Esparza, María 1898-1978"                                              
##  [2,] "Esparza, María Jesús"                                                  
##  [3,] "Esparza, Mariana Ochoa"                                                
##  [4,] "Esparza, María Elena"                                                  
##  [5,] "Esparza, Maria, 19..-...., auteure d'une thèse de sciences biologiques"
##  [6,] "Esparza, María Del Rosario Campos-"                                    
##  [7,] "Esparza, María Sanjuana Salazar"                                       
##  [8,] "Esparza, María"                                                        
##  [9,] "Esparza, María del Carmen Hernández"                                   
## [10,] "Esparza, María de los Angeles Cervantes"                               
##       score  nametype   viafid                  
##  [1,] "1022" "personal" "1335154741632153110006"
##  [2,] "1013" "personal" "9147370712141442276"   
##  [3,] "1012" "personal" "67961098"              
##  [4,] "1012" "personal" "48986872"              
##  [5,] "1011" "personal" "250164959833724021614" 
##  [6,] "515"  "personal" "118153833232264332271" 
##  [7,] "509"  "personal" "121210664"             
##  [8,] "509"  "personal" "466160667844803560008" 
##  [9,] "508"  "personal" "103543869"             
## [10,] "508"  "personal" "31757197"
v_AutoSuggest('Escobar, Modesto')
##      term                                  score  nametype   viafid    
## [1,] "Escobar, Modesto, 1958-"             "1286" "personal" "75898534"
## [2,] "Escobar, Modesto"                    "1278" "personal" "75898534"
## [3,] "Escobar, Modesto, 1940-"             "1246" "personal" "6744770" 
## [4,] "Escobar, Modesto, (Escobar Espinar)" "628"  "personal" "6744770"
# Note that four rows are returned, but only two different viafids.

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.