The hardware and bandwidth for this mirror is donated by METANET, the Webhosting and Full Service-Cloud Provider.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]metanet.ch.

Examples with wiki_utils

Angel Zazo, Department of Computer Science and Automatics, University of Salamanca

2024-07-26

Functions

Functions to obtain a list of Wikidata entities

w_SearchByLabel(string, mode='inlabel', langs="", langsorder='', instanceof="", Pproperty="", debug=FALSE)

w_SearchByOccupation(Qoc, mode=c('default','count','wikipedias'), langsorder='', wikilangs='', nlimit=10000, debug=FALSE)

Functions to obtain information from a list of Wikidata entities or from a single one.

w_isInstanceOf(entity_list, instanceof='', nlimit=50000, debug=FALSE)

w_Wikipedias(entity_list, wikilangs="", instanceof='', nlimit=1500, debug=FALSE)

w_isValid(entity_list, nlimit=50000, debug=FALSE)

w_Property(entity_list, Pproperty, includeQ=FALSE, langsorder=‘en’, nlimit=10000, debug=FALSE)

w_SearchByAuthority(Pauthority, langsorder='', instanceof='', nlimit=10000, debug=FALSE)

Pauthority = Authority Database Property in Wikidata

w_EntityInfo(entity_list, mode='default', langsorder='', wikilangs="", nlimit=MW_LIMIT, debug=FALSE)

Functions to obtain information using the Wikimedia APIs

m_Opensearch(string, project=‘en.wikipedia.org’, profile=“engine_autoselect”, redirects=“resolve”)

m_reqMediaWiki(titles, mode=c(‘wikidataEntity’,‘redirects’,‘pagePrimaryImage’,‘pageFiles’), project=‘en.wikipedia.org’, redirects=TRUE, exclude_ext=‘svg|webp|xcf’)

m_Pageviews(article, start, end, project=“en.wikipedia.org”, access=“all-access”, agent=“user”, granularity=“monthly”, redirects=FALSE)

m_XtoolsInfo(article, infotype=c(“articleinfo”, “prose”, “links”), project=“en.wikipedia.org”, redirects=FALSE)

Functions to obtain information (viafID or cluster records) using the VIAF API

v_AutoSuggest(author) : obtains viafID

v_Search(CQL_Query, mode=c(‘default’, ‘anyField’, ‘allmainHeadingEl’, ‘allNames’, ‘allPersonalNames’, ‘allTitle’), schema=c(‘brief’, ‘JSON’)) : obtains clusters records

Function to retrieve a cluster record using the viafID.

v_GetRecord(viafid, record_format=‘viaf.json’): retrieve a cluster record

Function to extract information from a VIAF cluster record

v_Extract(viaf, info, source=NULL)

Package installation and loading

To install and load the updated version of the wikiTools package simply run the following commands:

install.packages("wikiTools")
library(wikiTools)

Examples of Wikidata functions using WDQS

Search string “Iranzo” in different positions

Exact search in Label or exact search in AltLabel (case-sensitive and diacritic-sensitive)

Optional: limit by instanceof Wikidata class (Qxx).

Optional: return information of some properties (Pproperties, Pxxx).

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570')

Search at the beginning in Label or AltLabel (diacritics and case are ignored)

df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en', instanceof = 'Q5',
                      mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='es|en',
                      instanceof = 'Q5|Q101352', mode='startswith')
df <- w_SearchByLabel(string='Iranzo', lang='en', langsorder='en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='startswith')

Search in any position in Label or AltLabel (diacritics and case are ignored)

If lang='' the search is performed in any language; otherwise it is performed only in the language indicated.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')

Search only in Chinese (Simplified) (language code: zh):

df <- w_SearchByLabel(string='Iranzo', langsorder='zh|es', lang='zh', mode='inlabel')

Optional instanceof and Property

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5|Q101352',
                      mode='inlabel')
df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', instanceof = 'Q5',
                      Pproperty = 'P21|P569|P570', mode='inlabel')

aux: getting a vector of entities (l) to use later.

df <- w_SearchByLabel(string='Iranzo', langsorder='es|en', mode='inlabel')
l <- df$entity

w_isInstanceOf

Check if the elements in entity_list are instances of a Wikidata class

df <- w_isInstanceOf(entity_list=l, instanceof='Q5')
# Not TRUE
df[!df$instanceof_Q5,]
##                entity       instanceof instanceof_Q5
## Q6058550     Q6058550   Q16560|Q133215         FALSE
## Q11912738   Q11912738            Q3947         FALSE
## Q31835108   Q31835108        Q24529780         FALSE
## Q45976259   Q45976259          Q101352         FALSE
## Q45987474   Q45987474         Q4167410         FALSE
## Q47034606   Q47034606         Q1642895         FALSE
## Q83296470   Q83296470                          FALSE
## Q85684513   Q85684513 Q28564|Q12317349         FALSE
## Q97101007   Q97101007          Q245117         FALSE
## Q97101009   Q97101009          Q245117         FALSE
## Q111015546 Q111015546             Q571         FALSE
## Q117783790 Q117783790          Q811430         FALSE
## Q125544306 Q125544306        Q47461344         FALSE
## Q125544313 Q125544313         Q3331189         FALSE

w_Wikipedias

Search for Wikipedia pages in all/some languages

Optional: instanceOF (limit to entities which are instance of a Wikidata class)

df <- w_Wikipedias(entity_list=l)
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr')
df <- w_Wikipedias(entity_list=l, wikilangs='es|en|fr', instanceof="Q5")

w_SearchByOccupation

Count entities, or get the entities with that occupation, also get Wikipedia pages

Note: depending on the connection speed, the nlimit parameter must be adjusted

w_SearchByOccupation(Qoc="Q2306091", mode='count') # "Q2306091" Qoc for Sociologist
## [1] 19614
q <- w_SearchByOccupation(Qoc="Q2306091")
l <- q$entity
lw <- w_SearchByOccupation(Qoc='Q2306091', mode='wikipedias') # lw=dataframe
# We can obtain the same information using previous function w_Wikipedias:
lw2 <- w_Wikipedias(entity_list=l)
# Verifying:
all(lw['Q10320558','pages'] == lw2['Q10320558','pages'])
# Verifying:
all(sort(strsplit(lw['Q9061', 'pages'], '|', fixed = T)[[1]]) ==
    sort(strsplit(lw2['Q9061', 'pages'], '|', fixed = T)[[1]]))

w_isValid

Check if the Wikidata entities are valid. An entity is valid if it has a label or a description. If an entity exists but is not valid, it is possible that it redirects to another entity; in that case, the redirection is obtained. Other entities may have existed in the past but are currently deleted.

l2 <- append(l, c("Q115637688", "Q105660123"))  # Note: adding two new entities
v <- w_isValid(l2)
# Not valid
v[!v$valid,]
##                entity valid instanceof redirection
## Q115637688 Q115637688 FALSE                       
## Q105660123 Q105660123 FALSE              Q97352588

w_Property

Obtain properties of entity_list.

p <- w_Property(l, Pproperty = 'P21|P569|P214', langsorder = 'es|en')

w_SearchByAuthority

Search for Wikidata entities that have an identifier in the Wikidata authority property “Pauthority”.

Optional: instanceOf

Example: Pauthority=P4439 (has identifier in the Museo Nacional Centro de Arte Reina Sofía)

mncars   <- w_SearchByAuthority(Pauthority="P4439", langsorder = 'es|en')
# 1286  [human, groups, etc.]
mncarsQ5 <- w_SearchByAuthority(Pauthority="P4439", langsorder = 'es|en',
                                     instanceof = 'Q5')  # 1280
# Entities that are not 'human' (Q5) (see the entityDescription column):
mncars[!(mncars$entity %in% mncarsQ5$entity),]  # not instance of Q5.
##                entity                                  entityLabel
## Q27657364   Q27657364 Midnight Gardening (Jardinería a medianoche)
## Q105687869 Q105687869            João Maria Gusmão and Pedro Paiva
## Q4517304     Q4517304                                  Chto Delat?
## Q317874       Q317874                                     Ant Farm
## Q20102460   Q20102460                        Agustín Parejo School
## Q5849776     Q5849776                                 Estrujenbank
##                                   entityDescription        instanceof
## Q27657364                 cuadro de Jerónimo Elespe          Q3305213
## Q105687869                  Portuguese artistic duo         Q85942930
## Q4517304                     Russian art collective          Q1400264
## Q317874    American art and architecture collective Q1400264|Q4387609
## Q20102460                                                    Q1400264
## Q5849776                        colectivo artístico          Q1400264
##                                          instanceofLabel                 P4439
## Q27657364                                        pintura       elespe-jeronimo
## Q105687869                                 duo artístico     gusmao-joao-maria
## Q4517304                           colectivo de artistas            chto-delat
## Q317874    colectivo de artistas|estudio de arquitectura              ant-farm
## Q20102460                          colectivo de artistas agustin-parejo-school
## Q5849776                           colectivo de artistas          estrujenbank

w_EntityInfo

Get some properties of a Wikidata entity.

df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en')
df <- w_EntityInfo(entity_list='Q134644', langsorder='es|en', wikilangs='es|en|fr')
df <- w_EntityInfo(c('Q270510', 'Q1675466', 'Q24871'), mode='film', langsorder='es|en', wikilangs='es|en|fr')
# Search string 'abba' inlabel
w <- w_SearchByLabel('abba', mode='inlabel', langsorder = '', instanceof = 'Q5')
df <- w_EntityInfo(w$entity, langsorder='en', wikilangs='en|es|fr', debug='info')
# Search 3D films
w <- w_SearchByInstanceof(instanceof='Q229390', langsorder = 'en|es', debug = 'info')
df <- w_EntityInfo(w$entity, mode="film", langsorder='en', wikilangs='en', debug='info')

Examples of WikiMedia functions

m_Opensearch

Search for articles that contain any of the words (note: it is better to use a long string)

Some search profiles:

df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org',
                   profile="engine_autoselect", redirects="resolve")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="strict")
df <- m_Opensearch(string='Duque de Alba', project='es.wikipedia.org', profile="fuzzy")

m_reqMediaWiki

Checks if titles are in a Wikimedia project and returns the Wikidata entity for them, if they have one.

Note that URLdecode(“a%CC%8C”) is the letter “a” with the combining caron (ǎ)

df <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                        mode='wikidataEntity', project='en.wikipedia.org')

Obtains the redirections of a page (the page itself can be a redirect to other page).

Returns a vector for each title; in each vector the first element is the destination, and the rest are all the pages that redirect to it.

a <- m_reqMediaWiki(c('Cervantes', 'Planck', 'Noexiste'), mode='redirects',
                    project='es.wikipedia.org')
a
## $Cervantes
##  [1] "Miguel de Cervantes"            "Miguel de Cerbantes"           
##  [3] "Miguel de Cervantes y Saavedra" "Miguel De Cervantes y Saavedra"
##  [5] "El manco de Lepanto"            "Miguel de cervantes"           
##  [7] "Manco de Lepanto"               "Don Miguel de Cervantes"       
##  [9] "Cervantino"                     "Cervantina"                    
## [11] "Miguel de Cervantes Saavedra"   "Cervantes Saavedra, Miguel de" 
## [13] "Miguel de Cervantes y Cortinas" "Cervantesco"                   
## [15] "Cervántico"                     "Cervantes"                     
## 
## $Planck
## [1] "Max Planck"                   "Planck"                      
## [3] "Max Karl Ernst Ludwig Planck"
## 
## $Noexiste
## [1] NA

Gets the URL of the primary image of Wikimedia pages.

Gets all URL of files inserted in the pages (images, sounds, videos…), using ‘|’ as separator, and excluding some extensions in the exclude_ext parameter.

Both functions automatically resolve redirects (the destination is the “normalized” column of the data-frame returned).

i <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pagePrimaryImage')

f <- m_reqMediaWiki(c('Max Planck', URLdecode("a%CC%8C"), 'Max', 'Cervante', 'humanist'),
                  mode='pageFiles', exclude_ext = "svg|webp|xcf")

m_Pageviews

Gets the visits that a page has had in a date interval

Optional: redirects

v <-  m_Pageviews(article="Cervantes", start="20230101", end="20230501",
                   project="es.wikipedia.org", granularity="monthly")
vv <- m_Pageviews(article="Cervantes", start="20230101", end="20230501",
                   project="es.wikipedia.org", granularity="monthly",
                   redirects=TRUE)

m_XtoolsInfo

Obtains information (as vector) about an article in the Wikimedia project.

Infotype: articleinfo, prose, links

Optional: redirects

x <-  m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org")
xx <- m_XtoolsInfo(article="Cervantes", infotype="articleinfo", project="es.wikipedia.org",
                   redirects=TRUE)

y <-  m_XtoolsInfo(article="Miguel de Cervantes", infotype="links", project="es.wikipedia.org")
yy <- m_XtoolsInfo(article="Cervantes", infotype="links", project="es.wikipedia.org",
                    redirects=TRUE)

Gets all information (articleinfo, prose, links).

z  <- m_XtoolsInfo(article="Miguel de Cervantes", infotype="all", project="es.wikipedia.org")
zz <- m_XtoolsInfo(article="Cervantes", infotype="all", project="es.wikipedia.org",
                       redirects=TRUE)

Examples using VIAF functions

v_AutoSuggest

Searches for authors. Sometimes the same author appears several times, under different names.

Returns a data-frame.

Important: The API returns a maximum of 10 records.

v_AutoSuggest('Iranzo')
##       term                                                  score  nametype  
##  [1,] "Iranzo, Antonio, 1930-2003"                          "1577" "personal"
##  [2,] "Iranzo, Carmen"                                      "1439" "personal"
##  [3,] "Iranzo, Miguel Lucas de"                             "1392" "personal"
##  [4,] "Iranzo, G., 1918-1998"                               "1375" "personal"
##  [5,] "Iranzo Muñío, María Teresa, 19..-"                   "1346" "personal"
##  [6,] "Iranzo Simón, Víctor 1850-1890"                      "1226" "personal"
##  [7,] "Iranzo, Olga"                                        "1220" "personal"
##  [8,] "Iranzo Benedito, Manuel, 1867-1921"                  "1214" "personal"
##  [9,] "Iranzo Bielsa, José, el Pastor de Andorra 1915-2016" "1170" "personal"
## [10,] "Iranzo Martín, Juan Emilio 1956-"                    "1162" "personal"
##       viafid                
##  [1,] "87262213"            
##  [2,] "46775630"            
##  [3,] "3268989"             
##  [4,] "88012748"            
##  [5,] "48297869"            
##  [6,] "87244676"            
##  [7,] "49150565569906250223"
##  [8,] "87100730"            
##  [9,] "63243927"            
## [10,] "58295559"
v_AutoSuggest('Esparza, María')
##       term                                                                    
##  [1,] "Esparza, María 1898-1978"                                              
##  [2,] "Esparza, María Jesús"                                                  
##  [3,] "Esparza, Mariana Ochoa"                                                
##  [4,] "Esparza, María Elena"                                                  
##  [5,] "Esparza, Maria, 19..-...., auteure d'une thèse de sciences biologiques"
##  [6,] "Esparza, María Del Rosario Campos-"                                    
##  [7,] "Esparza, María Sanjuana Salazar"                                       
##  [8,] "Esparza, María"                                                        
##  [9,] "Esparza, María del Carmen Hernández"                                   
## [10,] "Esparza, María de los Angeles Cervantes"                               
##       score  nametype   viafid                  
##  [1,] "1022" "personal" "1335154741632153110006"
##  [2,] "1013" "personal" "9147370712141442276"   
##  [3,] "1012" "personal" "67961098"              
##  [4,] "1012" "personal" "48986872"              
##  [5,] "1011" "personal" "250164959833724021614" 
##  [6,] "515"  "personal" "118153833232264332271" 
##  [7,] "509"  "personal" "121210664"             
##  [8,] "509"  "personal" "466160667844803560008" 
##  [9,] "508"  "personal" "103543869"             
## [10,] "508"  "personal" "31757197"
v_AutoSuggest('Escobar, Modesto')
##      term                                  score  nametype   viafid    
## [1,] "Escobar, Modesto, 1958-"             "1286" "personal" "75898534"
## [2,] "Escobar, Modesto"                    "1278" "personal" "75898534"
## [3,] "Escobar, Modesto, 1940-"             "1246" "personal" "6744770" 
## [4,] "Escobar, Modesto, (Escobar Espinar)" "628"  "personal" "6744770"
# Note that four rows are returned, but only two different viafids.

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.