<!DOCTYPE html>
< html xmlns = "http://www.w3.org/1999/xhtml" lang = "en" xml:lang = "en" > < head >
< meta charset = "utf-8" >
< meta name = "generator" content = "quarto-1.3.450" >
< meta name = "viewport" content = "width=device-width, initial-scale=1.0, user-scalable=yes" >
2023-10-12 16:08:37 +00:00
<title>Hacking Religion: TRS &amp; Data Science in Action - 4 Data scraping, corpus analysis and wordclouds</title>
2023-10-02 10:52:45 +00:00
<style>
  /* Inline code keeps its whitespace but is allowed to wrap. */
  code {
    white-space: pre-wrap;
  }

  span.smallcaps {
    font-variant: small-caps;
  }

  /* Multi-column layout helpers. */
  div.columns {
    display: flex;
    gap: min(4vw, 1.5em);
  }

  div.column {
    flex: auto;
    overflow-x: auto;
  }

  div.hanging-indent {
    margin-left: 1.5em;
    text-indent: -1.5em;
  }

  /* Task lists: no bullets, checkbox pulled into the margin. */
  ul.task-list {
    list-style: none;
  }

  ul.task-list li input[type="checkbox"] {
    width: 0.8em;
    margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
    vertical-align: middle;
  }

  /* CSS for citations */
  div.csl-bib-body { }

  div.csl-entry {
    clear: both;
  }

  .hanging-indent div.csl-entry {
    margin-left: 2em;
    text-indent: -2em;
  }

  div.csl-left-margin {
    min-width: 2em;
    float: left;
  }

  div.csl-right-inline {
    margin-left: 2em;
    padding-left: 1em;
  }

  div.csl-indent {
    margin-left: 2em;
  }
</style>
< script src = "site_libs/quarto-nav/quarto-nav.js" > < / script >
< script src = "site_libs/quarto-nav/headroom.min.js" > < / script >
< script src = "site_libs/clipboard/clipboard.min.js" > < / script >
< script src = "site_libs/quarto-search/autocomplete.umd.js" > < / script >
< script src = "site_libs/quarto-search/fuse.min.js" > < / script >
< script src = "site_libs/quarto-search/quarto-search.js" > < / script >
< meta name = "quarto:offset" content = "./" >
2023-10-12 16:08:37 +00:00
< link href = "./chapter_5.html" rel = "next" >
2023-10-02 10:52:45 +00:00
< link href = "./chapter_3.html" rel = "prev" >
< script src = "site_libs/quarto-html/quarto.js" > < / script >
< script src = "site_libs/quarto-html/popper.min.js" > < / script >
< script src = "site_libs/quarto-html/tippy.umd.min.js" > < / script >
< script src = "site_libs/quarto-html/anchor.min.js" > < / script >
< link href = "site_libs/quarto-html/tippy.css" rel = "stylesheet" >
< link href = "site_libs/quarto-html/quarto-syntax-highlighting.css" rel = "stylesheet" id = "quarto-text-highlighting-styles" >
< script src = "site_libs/bootstrap/bootstrap.min.js" > < / script >
< link href = "site_libs/bootstrap/bootstrap-icons.css" rel = "stylesheet" >
< link href = "site_libs/bootstrap/bootstrap.min.css" rel = "stylesheet" id = "quarto-bootstrap" data-mode = "light" >
< script id = "quarto-search-options" type = "application/json" > {
"location": "sidebar",
"copy-button": false,
"collapse-after": 3,
"panel-placement": "start",
"type": "textbox",
"limit": 20,
"language": {
"search-no-results-text": "No results",
"search-matching-documents-text": "matching documents",
"search-copy-link-title": "Copy link to search",
"search-hide-matches-text": "Hide additional matches",
"search-more-match-text": "more match in this document",
"search-more-matches-text": "more matches in this document",
"search-clear-button-title": "Clear",
"search-detached-cancel-button-title": "Cancel",
"search-submit-button-title": "Submit",
"search-label": "Search"
}
}< / script >
< / head >
< body class = "nav-sidebar floating" >
< div id = "quarto-search-results" > < / div >
<header id="quarto-header" class="headroom fixed-top">
  <!-- Secondary navigation bar: sidebar toggle, breadcrumb for the current chapter, search. -->
  <nav class="quarto-secondary-nav">
    <div class="container-fluid d-flex">
      <!-- Collapses/expands #quarto-sidebar together with its backdrop (#quarto-sidebar-glass). -->
      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
        <i class="bi bi-layout-text-sidebar-reverse" aria-hidden="true"></i>
      </button>
      <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="./chapter_4.html"><span class="chapter-number">4</span> <span class="chapter-title">Data scraping, corpus analysis and wordclouds</span></a></li></ol></nav>
      <!-- Empty stretch area carrying the same toggle attributes and handler as the button above.
           NOTE(review): it has no href or tabindex, so it is pointer-only; the button is the keyboard path. -->
      <a class="flex-grow-1" role="button" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
      </a>
      <!-- Icon-only button: the accessible name must come from aria-label (it was an empty string). -->
      <button type="button" class="btn quarto-search-button" aria-label="Search" onclick="window.quartoOpenSearch();">
        <i class="bi bi-search" aria-hidden="true"></i>
      </button>
    </div>
  </nav>
</header>
<!-- content -->
< div id = "quarto-content" class = "quarto-container page-columns page-rows-contents page-layout-article" >
<!-- sidebar -->
<!-- Book sidebar: title link, search mount point, and one entry per chapter.
     The entry for the page being viewed carries class "active" and aria-current="page". -->
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal sidebar-navigation floating overflow-auto" aria-label="Sidebar">
  <div class="pt-lg-2 mt-2 text-left sidebar-header">
    <div class="sidebar-title mb-0 py-0">
      <a href="./">Hacking Religion: TRS &amp; Data Science in Action</a>
    </div>
  </div>
  <div class="mt-2 flex-shrink-0 align-items-center">
    <div class="sidebar-search">
      <div id="quarto-search" class="" title="Search"></div>
    </div>
  </div>
  <div class="sidebar-menu-container">
    <ul class="list-unstyled mt-1">
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./index.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text">Introduction: Hacking Religion</span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./chapter_1.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text"><span class="chapter-number">1</span> <span class="chapter-title">The 2021 UK Census</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./chapter_2.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text"><span class="chapter-number">2</span> <span class="chapter-title">Survey Data: Spotlight Project</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./chapter_3.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text"><span class="chapter-number">3</span> <span class="chapter-title">Mapping churches: geospatial data science</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./chapter_4.html" class="sidebar-item-text sidebar-link active" aria-current="page">
            <span class="menu-text"><span class="chapter-number">4</span> <span class="chapter-title">Data scraping, corpus analysis and wordclouds</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./chapter_5.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text"><span class="chapter-number">5</span> <span class="chapter-title">What’s next?</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./summary.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text"><span class="chapter-number">6</span> <span class="chapter-title">Summary</span></span></a>
        </div>
      </li>
      <li class="sidebar-item">
        <div class="sidebar-item-container">
          <a href="./references.html" class="sidebar-item-text sidebar-link">
            <span class="menu-text">References</span></a>
        </div>
      </li>
    </ul>
  </div>
</nav>
<!-- Backdrop shown with the sidebar; it carries the same collapse targets as the sidebar toggle. -->
<div id="quarto-sidebar-glass" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar,#quarto-sidebar-glass"></div>
<!-- margin-sidebar -->
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
  <!-- In-page table of contents; this chapter only has the References section. -->
  <nav id="TOC" role="doc-toc" class="toc-active">
    <h2 id="toc-title">Table of contents</h2>
    <ul>
      <li><a href="#references" id="toc-references" class="nav-link active" data-scroll-target="#references">References</a></li>
    </ul>
  </nav>
</div>
<!-- main -->
<main class="content" id="quarto-document-content">
  <!-- Chapter title block. -->
  <header id="title-block-header" class="quarto-title-block default">
    <div class="quarto-title">
      <h1 class="title"><span class="chapter-number">4</span> <span class="chapter-title">Data scraping, corpus analysis and wordclouds</span></h1>
    </div>
    <div class="quarto-title-meta">
    </div>
  </header>
  <!-- Target of the "#references" table-of-contents link; the #refs list is empty and hidden on this page. -->
  <section id="references" class="level1 unnumbered">
    <h1 class="unnumbered">References</h1>
    <div id="refs" role="list" style="display: none">
    </div>
  </section>
</main> <!-- /main -->
< script id = "quarto-html-after-body" type = "application/javascript" >
window.document.addEventListener("DOMContentLoaded", function (event) {
/**
 * Mirror a stylesheet's colour mode onto <body>.
 *
 * Reads the `data-mode` attribute of the given element: "dark" puts
 * `quarto-dark` on the body and removes `quarto-light`; any other value
 * (including a missing attribute) does the opposite.
 *
 * @param {Element} bsSheetEl element carrying the `data-mode` attribute
 */
const toggleBodyColorMode = (bsSheetEl) => {
  const isDark = bsSheetEl.getAttribute("data-mode") === "dark";
  const { classList } = window.document.querySelector("body");
  const [applied, cleared] = isDark
    ? ["quarto-dark", "quarto-light"]
    : ["quarto-light", "quarto-dark"];
  classList.add(applied);
  classList.remove(cleared);
};
// Apply the colour mode declared on the `link#quarto-bootstrap` stylesheet
// to <body>; does nothing when the page has no such link element.
const toggleBodyColorPrimary = () => {
  const primarySheet = window.document.querySelector("link#quarto-bootstrap");
  if (!primarySheet) {
    return;
  }
  toggleBodyColorMode(primarySheet);
};
// Run once as soon as the DOM is ready.
toggleBodyColorPrimary();
// Heading anchors: configure AnchorJS to place its link to the right of each
// `.anchored` element, using `icon` as the link glyph.
// NOTE(review): `icon` is an empty string here — confirm the glyph was not lost upstream.
const icon = "";
const anchorJS = new window.AnchorJS();
anchorJS.options = { placement: 'right', icon };
anchorJS.add('.anchored');
/**
 * Tell whether an element is a code-annotation marker.
 *
 * @param {Element} el element to inspect
 * @returns {boolean} true when any of its classes starts with `code-annotation-`
 */
const isCodeAnnotation = (el) =>
  Array.from(el.classList).some((clz) => clz.startsWith('code-annotation-'));
// Copy-to-clipboard for code blocks: every `.code-copy-button` copies the text
// of the element immediately preceding it, with annotation markers stripped.
const clipboard = new window.ClipboardJS('.code-copy-button', {
  text: function(trigger) {
    // Work on a deep clone so the rendered code block is left untouched.
    const codeEl = trigger.previousElementSibling.cloneNode(true);
    // `children` is a live collection: removing a node while iterating it
    // directly shifts the remaining nodes and skips the one that follows.
    // Iterate over a snapshot so adjacent annotation nodes are all removed.
    for (const childEl of Array.from(codeEl.children)) {
      if (isCodeAnnotation(childEl)) {
        childEl.remove();
      }
    }
    return codeEl.innerText;
  }
});
// After a successful copy: flash the "checked" state on the button for one
// second and, when Bootstrap is loaded, show a transient "Copied!" tooltip.
clipboard.on('success', function(e) {
  // button target
  const button = e.trigger;
  // don't keep focus
  button.blur();
  // flash "checked"
  button.classList.add('code-copy-button-checked');
  // null when the button had no title attribute to begin with
  const currentTitle = button.getAttribute("title");
  button.setAttribute("title", "Copied!");
  let tooltip;
  if (window.bootstrap) {
    button.setAttribute("data-bs-toggle", "tooltip");
    button.setAttribute("data-bs-placement", "left");
    button.setAttribute("data-bs-title", "Copied!");
    tooltip = new bootstrap.Tooltip(button,
      { trigger: "manual",
        customClass: "code-copy-button-tooltip",
        offset: [0, -8]});
    tooltip.show();
  }
  setTimeout(function() {
    if (tooltip) {
      tooltip.hide();
      button.removeAttribute("data-bs-title");
      button.removeAttribute("data-bs-toggle");
      button.removeAttribute("data-bs-placement");
    }
    // Restore the original title. Writing a null value back with setAttribute
    // would leave the literal string "null" as the title, so drop it instead.
    if (currentTitle === null) {
      button.removeAttribute("title");
    } else {
      button.setAttribute("title", currentTitle);
    }
    button.classList.remove('code-copy-button-checked');
  }, 1000);
  // clear code selection
  e.clearSelection();
});
/**
 * Attach a hover popup to `el` through `window.tippy`.
 *
 * @param {Element} el element that triggers the popup
 * @param {Function} contentFn passed to tippy as `content`; HTML is allowed
 */
function tippyHover(el, contentFn) {
  // The popup is appended to the parent of the element tippy hands back.
  const attachToParent = (reference) => reference.parentElement;
  window.tippy(el, {
    allowHTML: true,
    content: contentFn,
    maxWidth: 500,
    delay: 100,
    arrow: false,
    appendTo: attachToParent,
    interactive: true,
    interactiveBorder: 10,
    theme: 'quarto',
    placement: 'bottom-start'
  });
}
// Footnote previews: hovering a footnote reference shows the HTML of the
// footnote it points at.
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
for (const ref of noterefs) {
  tippyHover(ref, function() {
    // use id or data attribute instead here
    let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href') || "";
    // Absolute URLs are reduced to their fragment. `new URL` throws on
    // relative or fragment-only values, which are then used unchanged.
    try { href = new URL(href).hash; } catch {}
    const id = href.replace(/^#\/?/, "");
    const note = window.document.getElementById(id);
    // A reference whose target is not on the page yields an empty popup
    // instead of throwing inside the tooltip callback.
    return note ? note.innerHTML : "";
  });
}
let selectedAnnoteEl;
/**
 * Build the CSS selector for the annotation span of a code cell.
 *
 * @param {string} cell value matched against `data-code-cell`
 * @param {string} annotation value matched against `data-code-annotation`
 * @returns {string} a `span[...][...]` attribute selector
 */
const selectorForAnnotation = (cell, annotation) =>
  `span[data-code-cell="${cell}"][data-code-annotation="${annotation}"]`;
/**
 * Highlight the code lines that belong to a clicked annotation.
 *
 * Reads `data-target-cell` / `data-target-annotation` from `annoteEl`, looks
 * up the matching annotation span and its `data-code-lines` list, then
 * positions two absolutely-placed divs (`#code-annotation-line-highlight` and
 * `#code-annotation-line-highlight-gutter`) over the first-to-last line range.
 * Finally records `annoteEl` as the current selection.
 *
 * Any element that cannot be found is skipped rather than dereferenced, so a
 * page without the expected markup gets no highlight instead of a TypeError.
 *
 * @param {Element} annoteEl the annotation element that was activated
 */
const selectCodeLines = (annoteEl) => {
  const doc = window.document;
  const targetCell = annoteEl.getAttribute("data-target-cell");
  const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
  const annoteSpan = doc.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
  const codeLines = annoteSpan ? annoteSpan.getAttribute("data-code-lines") : null;
  // Line element ids have the form "<cell>-<line>".
  const lineIds = codeLines
    ? codeLines.split(",").map((line) => targetCell + "-" + line)
    : [];
  const firstEl = lineIds.length > 0 ? doc.getElementById(lineIds[0]) : null;
  const lastEl = lineIds.length > 0 ? doc.getElementById(lineIds[lineIds.length - 1]) : null;
  // The highlight is appended two levels above the first line element.
  const parent = firstEl && firstEl.parentElement
    ? firstEl.parentElement.parentElement
    : null;
  if (firstEl && lastEl && parent) {
    // Span from the top of the first line to the bottom of the last one
    // (for a single line, lastEl is firstEl and this is just its height).
    const top = firstEl.offsetTop;
    const height = lastEl.offsetTop + lastEl.offsetHeight - top;
    // cook up a div (if necessary) and position it
    let div = doc.getElementById("code-annotation-line-highlight");
    if (div === null) {
      div = doc.createElement("div");
      div.setAttribute("id", "code-annotation-line-highlight");
      div.style.position = 'absolute';
      parent.appendChild(div);
    }
    div.style.top = top - 2 + "px";
    div.style.height = height + 4 + "px";
    let gutterDiv = doc.getElementById("code-annotation-line-highlight-gutter");
    if (gutterDiv === null) {
      const codeCell = doc.getElementById(targetCell);
      const gutter = codeCell ? codeCell.querySelector('.code-annotation-gutter') : null;
      if (gutter) {
        gutterDiv = doc.createElement("div");
        gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
        gutterDiv.style.position = 'absolute';
        gutter.appendChild(gutterDiv);
      }
    }
    if (gutterDiv) {
      gutterDiv.style.top = top - 2 + "px";
      gutterDiv.style.height = height + 4 + "px";
    }
  }
  selectedAnnoteEl = annoteEl;
};
// Remove both highlight overlays (code and gutter) if they exist and forget
// the currently selected annotation.
const unselectCodeLines = () => {
  for (const elId of ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]) {
    const overlay = window.document.getElementById(elId);
    if (overlay) {
      overlay.remove();
    }
  }
  selectedAnnoteEl = undefined;
};
// Attach click handler to the DT
// Clicking an annotation term highlights its code lines; clicking the term
// that is already selected clears the highlight again.
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
for (const annoteDlNode of annoteDls) {
  annoteDlNode.addEventListener('click', () => {
    // Use the <dt> the listener is bound to rather than event.target: a click
    // landing on a descendant would otherwise never compare equal to the
    // stored selection and would lack the data-target-* attributes.
    const clickedEl = annoteDlNode;
    if (clickedEl !== selectedAnnoteEl) {
      unselectCodeLines();
      const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
      if (activeEl) {
        activeEl.classList.remove('code-annotation-active');
      }
      selectCodeLines(clickedEl);
      clickedEl.classList.add('code-annotation-active');
    } else {
      // Unselect the line
      unselectCodeLines();
      clickedEl.classList.remove('code-annotation-active');
    }
  });
}
/**
 * Walk up from `el` to the nearest ancestor that has a non-empty
 * `data-cites` attribute.
 *
 * @param {Element} el starting element
 * @returns {{el: Element, cites: string[]}|undefined} the child of that
 *   ancestor on the path from `el` (possibly `el` itself) together with the
 *   space-separated cite keys, or undefined when no ancestor has any
 */
const findCites = (el) => {
  let current = el;
  while (current.parentElement) {
    const parent = current.parentElement;
    const cites = parent.dataset.cites;
    if (cites) {
      return { el: current, cites: cites.split(' ') };
    }
    current = parent;
  }
  return undefined;
};
// Citation previews: hovering a citation shows the bibliography entries for
// every key listed in the nearest ancestor's `data-cites` attribute.
const bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
for (const ref of bibliorefs) {
  const citeInfo = findCites(ref);
  if (citeInfo) {
    tippyHover(citeInfo.el, function() {
      const popup = window.document.createElement('div');
      citeInfo.cites.forEach(function(cite) {
        const citeDiv = window.document.createElement('div');
        citeDiv.classList.add('hanging-indent');
        citeDiv.classList.add('csl-entry');
        // Bibliography entries are looked up by the id "ref-<key>"; a key
        // without an entry on this page contributes an empty div.
        const biblioDiv = window.document.getElementById('ref-' + cite);
        if (biblioDiv) {
          citeDiv.innerHTML = biblioDiv.innerHTML;
        }
        popup.appendChild(citeDiv);
      });
      return popup.innerHTML;
    });
  }
}
});
< / script >
<!-- Previous/next chapter links; the targets match the rel="prev"/rel="next" links in <head>. -->
<nav class="page-navigation" aria-label="Page navigation">
  <div class="nav-page nav-page-previous">
    <a href="./chapter_3.html" class="pagination-link" rel="prev">
      <i class="bi bi-arrow-left-short" aria-hidden="true"></i> <span class="nav-page-text"><span class="chapter-number">3</span> <span class="chapter-title">Mapping churches: geospatial data science</span></span>
    </a>
  </div>
  <div class="nav-page nav-page-next">
    <a href="./chapter_5.html" class="pagination-link" rel="next">
      <span class="nav-page-text"><span class="chapter-number">5</span> <span class="chapter-title">What’s next?</span></span> <i class="bi bi-arrow-right-short" aria-hidden="true"></i>
    </a>
  </div>
</nav>
< / div > <!-- /content -->
< / body > < / html >