Section 3 Data collection
The data were collected using custom search and scraping scripts for each online community. The general approach was similar for all four communities. A first script used the search function of the community to collect all links to threads that contained the term impf (roughly equivalent to the search term “vaccin” in English texts) in at least one post. The second script visited these links, scraped the relevant information, and saved it in a structured format. The subsequent sections present some excerpts from these scripts. Please note that the data collection is not perfectly reproducible, because both the content and the structure of the online communities may change over time. We therefore did not aim to provide a fully reproducible script, but instead document our basic scraping functions.
3.1 Urbia
3.1.1 Search
The search script for Urbia used PhantomJS to render the search results. The first R function creates a JavaScript call, which is then invoked from within R (inspired by this source).
# Write a small PhantomJS script that opens the Urbia mobile search page for
# `search_term` at result offset `page_number` and dumps the rendered HTML.
#
# search_term  term passed to the Urbia search (default "impf")
# page_number  result offset (Urbia paginates by item index, 10 per page)
# file_name    where the generated PhantomJS script is written
# html_out     file the rendered HTML is saved to (previously hard-coded;
#              parameterized so several result pages can be kept side by side)
create_js = function(search_term = "impf", page_number, file_name = "tmp.js",
    html_out = "urbia_search.html") {
    txt = paste0("
var webPage = require('webpage');
var page = webPage.create();
var fs = require('fs');
var path = '", html_out, "'
page.open('https://m.urbia.de/services/search?q=",
        search_term, "#q=", search_term, "&label=forum&sort=date&board=&start=",
        page_number, "', function (status) {
var content = page.content;
fs.write(path,content,'w')
phantom.exit();
});
")
    write_file(txt, file_name)
}
The search function is called once to identify the number of result pages.
# Render the first result page and read the total number of search hits.
create_js(page_number = "1")
system("phantomjs-2.1.1-macosx/bin/phantomjs tmp.js")
search_page = read_html("urbia_search.html")
n_hits = search_page %>%
    html_nodes("#results-amount") %>%
    html_text() %>%
    str_remove_all(" Ergebnisse") %>%
    as.integer()
# Urbia shows ten results per page.
n_pages = ceiling(n_hits / 10)
cat(paste0("There are ", n_pages, " pages."))
The links are collected from the first result page. We then loop over the remaining result pages and collect the links. All links were written to a text file.
# Links from first page: the thread links sit in <h2> headers 2..11.
links = read_html("urbia_search.html") %>% html_nodes("h2") %>% .[2:11] %>%
    html_nodes("a") %>% html_attr("href")
write_lines(links, "urbia_links.txt")
# Links from later pages.
# NOTE(review): the original wrote the accumulated links to "links.txt" inside
# the loop while the first page went to "urbia_links.txt"; one file is used for
# both here so the checkpoint and the final output agree.
# seq_len() keeps the loop empty when there is only one result page
# (1:(n_pages - 1) would iterate over c(1, 0) in that case).
for (i in seq_len(n_pages - 1)) {
    # Urbia paginates by item offset: page i+1 starts at item i*10 + 1,
    # hence appending "1" to the page counter.
    create_js(page_number = paste0(i, "1"))
    system("phantomjs-2.1.1-macosx/bin/phantomjs tmp.js")
    new_links = read_html("urbia_search.html") %>% html_nodes("h2") %>% .[2:11] %>%
        html_nodes("a") %>% html_attr("href")
    links = c(links, new_links)
    # Rewrite the full link list each pass so progress survives a crash.
    write_lines(links, "urbia_links.txt")
}
3.1.2 Collection
Urbia requires a login to access older posts. Username and password have to be set before the web scrape (inspired by this source). Note that we used the mobile version of the website (m.urbia.de).
# Address of the login webpage
login = "https://m.urbia.de/login?go=%2Flogin%3Fgo%3D"
# create a web session with the desired login address
# `pgsession` is module-level state reused by get_thread() below.
pgsession = html_session(login)
pgform = html_form(pgsession)[[2]] #in this case the submit is the 2nd form
# Fill in the credentials; replace the placeholders "XXX"/"YYY" with a real
# username and password before running the scrape.
filled_form = set_values(pgform, nick = "XXX", pw = "YYY")
submit_form(pgsession, filled_form)
Function to collect information on a thread.
# Scrape one Urbia thread (all of its pages) into a tibble with one row per
# post, using the logged-in `pgsession` and the page-level helper get_posts().
#
# link       thread path relative to https://m.urbia.de
# wait_time  seconds to sleep after the thread is done (politeness delay)
# verbose    print the URLs as they are visited
get_thread = function(link, wait_time = 0, verbose = FALSE) {
    if (verbose) {
        print(link)
    }
    first_page = read_html(jump_to(pgsession, paste0("https://m.urbia.de", link)))
    # Canonical URL of the thread, switched to the mobile host.
    full_link = first_page %>%
        html_nodes("link") %>%
        .[1] %>%
        html_attr("href") %>%
        str_replace("https://www", "https://m")
    thread_title = first_page %>%
        html_nodes("main h1") %>%
        html_text() %>%
        str_squish()
    subforum = first_page %>%
        html_nodes(".main") %>%
        html_nodes("span") %>%
        .[2] %>%
        html_text()
    # "1 von N" pagination label; absent on single-page threads.
    n_pages = first_page %>%
        html_nodes("li.current") %>%
        .[1] %>%
        html_text() %>%
        str_remove("1 von ") %>%
        as.integer()
    if (length(n_pages) == 0) {
        n_pages = 1L
    }
    d_out = get_posts(text = first_page)
    if (n_pages > 1) {
        for (page_i in 2:n_pages) {
            page_url = paste0(full_link, "?page=", page_i)
            if (verbose) {
                print(page_url)
            }
            page_html = read_html(jump_to(pgsession, page_url))
            d_out = bind_rows(d_out, get_posts(text = page_html))
        }
    }
    if (wait_time > 0) {
        Sys.sleep(wait_time)
    }
    d_out %>% mutate(postnumber = 1:n(), thread_title = thread_title, subforum = subforum,
        n_pages = n_pages, link = link, full_link = full_link)
}
Function to collect information on a post (called from within the thread function).
# Extract all posts from one rendered Urbia thread page.
# Returns a tibble with one row per post: text, author, date, the reply-to
# header, and the "helpful" star count.
get_posts = function(text) {
    posts = text %>% html_nodes(".main") %>% html_nodes(".wrap-text") %>% html_text() %>%
        str_squish()
    # Author names carry a parenthesized suffix which is stripped here.
    author = text %>% html_nodes(".main") %>% html_nodes(".post-author") %>%
        html_text() %>% str_remove(regex("\\([:print:]+")) %>% str_squish()
    postdate = text %>% html_nodes(".main") %>% html_nodes(".post-date") %>%
        html_text() %>% str_squish()
    post_reply = text %>% html_nodes(".main") %>% html_nodes(".post-reply-to") %>%
        html_text() %>% str_squish()
    # The opening post has no reply-to header; pad so the columns align.
    if (length(post_reply) < length(postdate)) {
        post_reply = c(NA, post_reply)
    }
    # Star counts live in a data attribute; fixed character positions 28..-8
    # cut the raw tag down to the number.  NOTE(review): fragile against
    # markup changes — confirm against the live page when rerunning.
    stars = text %>% html_nodes(".main") %>% html_nodes("span") %>% str_subset("data-helpful-counter") %>%
        str_sub(start = 28, end = -8) %>% as.integer()
    if (length(stars) == 0)
        stars = NA_integer_  # typed NA keeps the column integer across pages
    tibble(posts = posts, author = author, postdate = postdate, post_reply = post_reply,
        stars = stars)
}
3.2 Rund ums Baby
3.2.1 Search
The Rund ums Baby discussion board can be searched with the following function.
# Extract the result links from a Rund ums Baby search page.  The result list
# is the last <div> carrying a specific inline style; its <a> hrefs are the
# thread links.  Returns character(0) when the marker div is absent.
get_links = function(text) {
    div = text %>% html_nodes("div")
    hits = div %>%
        str_detect(fixed("style=\"margin:26px 0px 30px 6px;\"")) %>%
        which()
    # Guard: max(integer(0)) is -Inf, which made the original subscript fail
    # with an obscure error when no div matched.
    if (length(hits) == 0) {
        return(character(0))
    }
    div[max(hits)] %>% html_nodes("a") %>% html_attr("href")
}
The search function is called once to identify the number of result pages and to collect the links from the first page.
# Read the first result page and collect its thread links.
txt = read_html("https://www.rund-ums-baby.de/suche/index.htm?suchbegriff=impf&kategorie=eltern&seite=1")
links = get_links(txt)
# The pagination links carry the page number in their query string; the
# largest one is the number of result pages.
page_numbers = txt %>%
    html_nodes(".suche_navigation") %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    str_remove(fixed("?suchbegriff=impf&kategorie=eltern&seite=")) %>%
    as.integer()
n_pages = max(page_numbers)
We then loop over the remaining result pages and collect the links. All links were written to a text file.
3.2.2 Collection
Function to collect all information given the link to the thread.
# Scrape one Rund ums Baby thread into a tibble with one row per post.
#
# link       thread path relative to https://www.rund-ums-baby.de
# wait_time  seconds to sleep after the thread is done (politeness delay);
#            the original accepted this parameter but never used it
# verbose    print the link before visiting it
get_thread = function(link, wait_time = 0, verbose = FALSE) {
    if (verbose)
        print(link)
    txt = read_html(paste0("https://www.rund-ums-baby.de", link))
    subforum = txt %>% html_nodes(".forum_titel") %>% html_text()
    thread_title = txt %>% html_nodes("#content") %>% html_nodes("h1") %>% html_text()
    # Opening post and replies use different CSS classes.
    first_post = txt %>% html_nodes("#content") %>% html_nodes("p.beitrag") %>%
        html_text()
    later_posts = txt %>% html_nodes("#content") %>% html_nodes("p.antwort") %>%
        html_text()
    posts = c(first_post, later_posts)
    # The starter's name is the bold text inside the "Geschrieben von ... am"
    # cell; the date is what remains after stripping name and filler words.
    first_author = txt %>% html_nodes("#content") %>% html_nodes("td:contains(von)") %>%
        html_nodes("b") %>% html_text()
    later_authors = txt %>% html_nodes(".antwort_von_name") %>% html_text()
    author = c(first_author, later_authors)
    first_date = txt %>% html_nodes("#content") %>% html_nodes("td:contains(von)") %>%
        html_text(trim = TRUE) %>% str_remove(fixed(first_author)) %>% str_remove("von") %>%
        str_remove("am") %>% str_remove("Geschrieben") %>% str_squish()
    later_dates = txt %>% html_nodes("p.antwort_von") %>% html_text()
    # Strip the per-post "Antwort von <name> am " prefix to leave the date.
    for (i in seq_along(later_dates)) {
        later_dates[i] = str_remove(later_dates[i], fixed(paste0("Antwort von ",
            later_authors[i], " am ")))
    }
    postdate = c(first_date, later_dates)
    thread_replies = txt %>% html_nodes(".anzahl_antworten") %>% html_text() %>%
        str_remove(" Antwort:") %>% str_remove(" Antworten:") %>% as.integer()
    if (length(thread_replies) == 0)
        thread_replies = NA_integer_
    # Honor the politeness delay (fix: wait_time was previously ignored).
    if (wait_time > 0)
        Sys.sleep(wait_time)
    tibble(posts = posts, postdate = postdate, author = author, thread_replies = thread_replies,
        thread_title = thread_title, subforum = subforum, link = link) %>% mutate(postnumber = 1:n())
}
3.3 Netmoms
3.3.1 Search
The search results on Netmoms are presented with fixed pagination. Note that we access the static version of the website (static.netmoms.de).
# First Netmoms search page; the page-jump dropdown lists every result page.
txt = read_html("https://static.netmoms.de/search/discussion/impf")
# Drop the first option (the page we are already on).
result_pages = txt %>%
    html_nodes(".jumpToPage") %>%
    html_nodes("option") %>%
    html_attr("value") %>%
    .[-1]
The function extracts the links to the threads.
We apply the function to the first page and all subsequent pages. All links were written to a text file.
3.3.2 Collection
Function to collect information on a thread.
# Scrape one Netmoms thread (all of its pages) into a tibble with one row per
# post, using the page-level helper get_posts().
#
# link       thread path relative to https://www.netmoms.de
# wait_time  seconds to sleep after the thread is done (politeness delay);
#            the original accepted this parameter but never used it
# verbose    print the link before visiting it
get_thread = function(link, wait_time = 0, verbose = FALSE) {
    if (verbose)
        print(link)
    txt = read_html(paste0("https://www.netmoms.de", link))
    # Follow-up pages are listed in the page-jump dropdown; the first entry
    # is the current page and is dropped.
    next_replies = txt %>% html_nodes(".jumpToPage") %>% html_nodes("option") %>%
        html_attr("value") %>% unique() %>% .[-1]
    subforum = txt %>% html_nodes("h1") %>% .[1] %>% html_text()
    thread_title = txt %>% html_nodes("h1") %>% .[2] %>% html_text(trim = TRUE)
    # The tooltip infos hold the visit and reply counters (in that order);
    # non-numeric tooltips become NA and are dropped.  Hoisted out of the two
    # duplicated pipelines in the original.
    counters = txt %>% html_nodes(".tooltip-info") %>% html_text(trim = TRUE) %>%
        as.integer() %>% .[!is.na(.)]
    thread_visits = counters[1]
    thread_replies = counters[2]
    d_out = get_posts(text = txt)
    if (length(next_replies) > 0) {
        d_out2 = next_replies %>% map_df(~get_posts(text = read_html(.x)))
    } else {
        d_out2 = NULL
    }
    # Honor the politeness delay (fix: wait_time was previously ignored).
    if (wait_time > 0)
        Sys.sleep(wait_time)
    d_out %>% bind_rows(d_out2) %>% mutate(postnumber = 1:n(), subforum = subforum,
        thread_title = thread_title, thread_visits = thread_visits, thread_replies = thread_replies,
        n_pages = length(next_replies) + 1, link = link)
}
Function to collect information on a post (called from within the thread function).
# Extract all posts from one Netmoms thread page.
# Returns a tibble with one row per post: text, author, date, and whether the
# author carries an expert badge.
get_posts = function(text) {
first_post = text %>% html_nodes(".first-post") %>% html_nodes(".bodytext") %>%
html_text(trim = TRUE)
# Pad with NA so `posts` stays aligned with the author/date vectors even
# when a post body is missing from the page.
if (length(first_post) == 0)
first_post = NA_character_
# NOTE(review): ".activitiy-item" spelling presumably matches the site's own
# markup (sic) — confirm before "fixing" it.
later_posts = text %>% html_nodes(".activitiy-item") %>% html_nodes(".bodytext") %>%
html_text(trim = TRUE)
if (length(later_posts) == 0)
later_posts = NA_character_
posts = c(first_post, later_posts)
# Deleted accounts have no user link and are labeled "Gelöschte NetMom".
author = text %>% html_nodes(".owner-info") %>% map(~html_nodes(.x, ".link-to-user")) %>%
map_chr(~ifelse(length(.x) == 0, "Gelöschte NetMom", html_text(.x)))
postdate = text %>% html_nodes(".sl-0") %>% html_text()
# TRUE when the owner info contains an expert badge; NA when the badge node
# is missing entirely (html_node yields a missing node -> NA character).
expert = text %>% html_nodes(".owner-info") %>% html_node(".status-expert") %>%
as.character() %>% str_detect("expert")
tibble(posts = posts, author = author, postdate = postdate, expert = expert)
}
3.4 Babycenter
Babycenter was by far the hardest platform for the data collection. The conversation style differed from the other communities, which were more similar to traditional discussion boards with topically organized threads. The discussions on Babycenter were more similar to a group chat in which multiple topics were discussed in the same thread. This resulted in many more threads containing the search term and in much longer threads. Many posts were irrelevant for our research interest and had to be sorted out later on (see Section 5).
3.4.1 Search
We read the first result page. We extract the links from the first page and the number of search results. We calculate the number of result pages from the latter.
# First Babycenter result page, restricted to community comments.
txt = read_html("https://www.babycenter.de/search?q=impf&filterQuery=content_bucket:community_comment")
# Internal search-result links, then resolved to canonical thread URLs.
int_links = txt %>% html_nodes(".searchLink") %>% html_attr("href")
links = int_links %>% map_chr(~get_canonicallink(link = .x))
# Total hit count (thousands separator "." stripped before parsing).
n_results = txt %>%
    html_nodes("h1") %>%
    html_node("strong") %>%
    html_text() %>%
    str_remove(fixed(".")) %>%
    as.integer()
# Ten results per page.
n_pages = ceiling(n_results / 10)
The function extracts the links to the threads. The safe version prevents an abortion of the search loop in case of an error.
# Resolve a relative Babycenter search-result link to the canonical thread
# URL, with any query string stripped.  Returns NA when the page carries no
# canonical <link> element.
get_canonicallink = function(link) {
    canlink = read_html(paste0("https://www.babycenter.de", link)) %>% html_nodes("head") %>%
        html_nodes("link[rel=\"canonical\"]") %>% html_attr("href")
    # Guard: a page without a canonical link yields character(0), which made
    # the original if () fail with "argument is of length zero".
    if (length(canlink) == 0) {
        return(NA_character_)
    }
    if (str_detect(canlink, fixed("?"))) {
        # Keep everything up to (but excluding) the first "?".
        canlink = canlink %>% str_sub(end = str_locate(., fixed("?"))[, 1] -
            1)
    }
    canlink
}
# Error-tolerant wrapper: returns NA in $result instead of aborting the loop.
safe_get_canonicallink = safely(.f = get_canonicallink, otherwise = NA_character_)
We loop over all subsequent result pages. All links were written to a text file.
# Visit result pages 2..n_pages; startIndex = i*10 skips the first pages.
# seq_len() keeps the loop empty when there is only one result page
# (1:(n_pages - 1) would iterate over c(1, 0) in that case).
for (i in seq_len(n_pages - 1)) {
    print(i)
    txt2 = read_html(paste0("https://www.babycenter.de/search?startIndex=",
        i, "0&q=impf&filterQuery=content_bucket:community_comment"))
    new_int_links = txt2 %>% html_nodes(".searchLink") %>% html_attr("href")
    # safely() returns list(result, error); failed lookups yield NA in
    # $result instead of aborting the whole loop.
    new_links = new_int_links %>% map_chr(~safe_get_canonicallink(link = .x)$result)
    int_links = c(int_links, new_int_links)
    links = c(links, new_links)
    Sys.sleep(1)
}
write_lines(links, "bc_links.txt")
3.4.2 Collection
Function to collect information on a thread.
# Scrape one Babycenter thread (all of its pages) into a tibble with one row
# per post, using the page-level helper get_posts().
#
# link       absolute canonical thread URL
# wait_time  seconds to sleep between page requests
# verbose    print progress while scraping
get_thread = function(link, wait_time = 0, verbose = FALSE) {
    if (verbose)
        print(link)
    txt = read_html(link)
    # Extra pages are derived from the startIndex of the "last page"
    # pagination links (10 posts per page).
    start_idx = txt %>% html_nodes(".firstLast") %>% html_nodes("a") %>% html_attr("href") %>%
        str_sub(start = str_locate(., fixed("startIndex="))[, 2] + 1) %>% as.integer()
    if (length(start_idx) == 0) {
        n_pages = NA_integer_
    } else {
        # Several pagination links may match; the largest offset marks the
        # last page.  (The original kept the whole vector, which breaks the
        # scalar `if (!is.na(n_pages))` below when more than one link exists.)
        n_pages = max(start_idx)/10
    }
    thread_title = txt %>% html_nodes("h1") %>% html_text(trim = TRUE)
    subforum = txt %>% html_nodes("#groupLink") %>% html_text()
    posts = get_posts(txt)
    if (!is.na(n_pages)) {
        for (i in 1:n_pages) {
            if (verbose)
                print(i)
            txt2 = read_html(paste0(link, "?startIndex=", i, "0"))
            posts2 = get_posts(txt2)
            posts = bind_rows(posts, posts2)
            Sys.sleep(wait_time)
        }
    }
    # `postnumber` (was the typo `postnumer`) keeps the column name
    # consistent with the scrapers for the other three communities.
    posts %>% mutate(postnumber = 1:n(), thread_title = thread_title, subforum = subforum,
        n_pages = n_pages, link = link)
}
Function to collect information on a post (called from within the thread function).
# Extract all posts from one Babycenter thread page.
# Returns a tibble with one row per post: text, date, author, and like count.
# NOTE(review): the column is named `authors` here but `author` in the other
# three communities' scrapers — confirm downstream code before renaming.
get_posts = function(text) {
posts = text %>% html_nodes(".postContent") %>% html_nodes(".bodyText") %>%
html_text(trim = TRUE)
# The thread starter's name is the first whitespace-separated token of the
# first .postInfo block.
first_author = text %>% html_nodes(".postInfo") %>% .[1] %>% html_text(trim = TRUE) %>%
str_split("[:space:]") %>% .[[1]] %>% .[1]
# Replies label the starter as "Ursprünglicher Verfasser"; strip that tag.
later_authors = text %>% html_nodes(".postInfo") %>% html_nodes(".commentCreator") %>%
html_text(trim = TRUE) %>% str_remove(fixed("Ursprünglicher Verfasser")) %>%
str_squish()
authors = c(first_author, later_authors)
# Dates are the second token of each .postInfo block; the subscript
# [1:length(authors)] assumes one info block per author — TODO confirm.
postdate = text %>% html_nodes(".postInfo") %>% .[1:length(authors)] %>%
html_text(trim = TRUE) %>% str_remove(fixed("Ursprünglicher Verfasser")) %>%
str_squish() %>% str_split("[:space:]", simplify = TRUE) %>% .[, 2]
likes = text %>% html_nodes(".likeCount") %>% html_text() %>% as.integer()
tibble(posts = posts, postdate = postdate, authors = authors, likes = likes)
}