Processing facebook posts

### Workflow for processing Facebook posts extracted with getPage() function###

### The code was developed as part of the Ancient Identities in Modern Britain (IARH) project
### Part 1 imports the Facebook posts saved in multiple files and puts them in the list.
### Part 2 extracts only those posts that have relevant keywords and identifies which language they are written in.
### Part 3 obtains the summary of which keywords appear in the posts and how frequently and which languages the posts are written in
### Part 4 separates posts by language
### Part 5 verifies whether posts containing keywords are actually relevant
### Project: Ancient Identities in Modern Britain (IARH); ancientidentities.org
### Author: Marta Krzyzanska


# Set working directory
setwd("~path")

# Require "textcat" and "data.table" packages. If you have not installed them already, run install.packages() first.

library("textcat")
library("data.table")
#Require functions:

#searchPosts() - code available here: https://github.com/IARHeritages/Heritage-of-Brexit/blob/master/searchPosts()
#requestVerification() and verifyText() - code available here: https://github.com/IARHeritages/Heritage-of-Brexit/blob/master/verifyText()%20(and%20requestVerification())

#Requires defining or importing keywords - code for importing keywords available here:
#https://github.com/IARHeritages/Heritage-of-Brexit/blob/master/Importing%20and%20redefining%20keywords%20to%20search%20facebook%20posts

####Part 1: Load all the extracted posts and save them in the list, save the list put together.

#In this case we had 74 extracted files

listPosts<-c()

i=1

while (i<75){

load(paste("my_posts(",i,").R",sep=""))
listPosts[[i]] <- my_posts
i=i+1 }

#Save the list of lists of posts as one file

save(listPosts,file="listPosts.R")

######Part 2: Extract only the data with the relevant keywords and detect the language of the posts 

i=1
j=length(listPosts)+1

	while(i<j){
		print (paste("Searching keywords in list",i,"out of",j,sep=" "))
		k=1
		l=length(listPosts[[i]])+1
		while(k<l){
			if(length(listPosts[[i]][[k]])>0){
				print (paste("page",k,sep=" "))
				listPosts[[i]][[k]]$keywords <-NA
				m=1
				n=length(listPosts[[i]][[k]]$message)+1
				while(m<n){
					listPosts[[i]][[k]]$keywords[m] <- searchPost(listPosts[[i]][[k]]$message[m],postKeywords)
					m=m+1
				}
				listPosts[[i]][[k]] <- subset(listPosts[[i]][[k]],listPosts[[i]][[k]]$keywords!="")
				listPosts[[i]][[k]]$language<-textcat(listPosts[[i]][[k]]$message)
				k=k+1
			}
			else{
				print (paste("page",k,sep=" "))
				k=k+1
			}
		}
		i=i+1
	}
	
####Part3: Get the relevant statistics

##1: Merge all the dataframes into one table (only for statistics purposes):

listPostsTable <- rbindlist(listPosts[[1]])
i=2
j= length(listPosts)+1
while(i<j){
	lpi <- rbindlist(listPosts[[i]])
	listPostsTable <- rbind(listPostsTable,lpi)
	i=i+1}

### Get all the unique languages and all the unique entries for the keywords

languages <- as.data.frame(table(listPostsTable$language))
languages <- languages[with(languages, order(-Freq)), ]

uniqueKeywords <- as.data.frame(table(listPostsTable$keyword))
uniqueKeywords <- uniqueKeywords[with(uniqueKeywords, order(-Freq)), ]

####Part 4 option 1: extract only english posts or separate posts by language

listEnglishPosts <- listPosts
i=1
j=length(listPosts)+1

	while(i<j){
		print (paste("Subsetting posts in list",i,"out of",(j-1),sep=" "))
		k=1
		l=length(listPosts[[i]])+1
		while(k<l){
			if(length(listPosts[[i]][[k]])>0){
				print (paste("page",k,sep=" "))
				listEnglishPosts[[i]][[k]] <- subset(listEnglishPosts[[i]][[k]],listEnglishPosts[[i]][[k]]$language=="english")
				k=k+1
			}
			else{
				print (paste("page",k,sep=" "))
				k=k+1
			}
		}
		i=i+1
	}
	
####Part 4 option 2: separate list of posts by language

postsByLanguage<- c()
y=1
z=length(languages[[1]])+1
while(y<z){
listPostsByLanguage <-c()
	i=1
	j=length(listPosts)+1
	while(i<j){
		postsLanguage <- c()
		print (paste("Subsetting posts in list",i,"out of",(j-1),sep=" "))
		k=1
		l=length(listPosts[[i]])+1
		while(k<l){
			if(length(listPosts[[i]][[k]])>0 && length(listPosts[[i]][[k]][[1]])>0){
				print (paste("page",k,sep=" "))
				postsLanguage[[k]] <- subset(listPosts[[i]][[k]],listPosts[[i]][[k]]$language==as.character(languages[[1]][[y]]))
				k=k+1
			}
			else{
				print (paste("page",k,sep=" "))
				k=k+1
			}
		}
	listPostsByLanguage[[i]] <- postsLanguage
		i=i+1
	}
postsByLanguage[[y]] <- listPostsByLanguage
y=y+1
}


####Part 5: Verify the relevance of the posts

#Define or import keywords that guarantee the post to be relevant
confirmedKeywords <- c(#Enter keywords)
#Define empty dataframe for dataframes that have no posts in them
emptyDF <- listEnglishPosts[[3]][[1]]

#Mark the entries containing keywords that guarantee that the post is relevant as verified, and ask the user for the manual varification
for the rest of them.

i=3
j=length(listEnglishPosts)+1

	while(i<j){
		print (paste("Subsetting posts in list",i,"out of",(j-1),sep=" "))
		k=1
		l=length(listEnglishPosts[[i]])+1
		while(k<l){
			if(length(listEnglishPosts[[i]][[k]])>0 && length(listEnglishPosts[[i]][[k]][[1]])>0){
				print (paste("page",k,sep=" "))
				listEnglishPosts[[i]][[k]]$verified <- "NA"
				m=1
				n=length(listEnglishPosts[[i]][[k]]$message)+1
				while(m<n){
					o=1
					p=length(confirmedKeywords)+1
					while (o<p){
						y <- grepl(confirmedKeywords[o], listEnglishPosts[[i]][[k]]$message[m],ignore.case = TRUE)
						if(y==TRUE){
							listEnglishPosts[[i]][[k]]$verified[m] <- "yes"
						o=o+1}
						else{
						o=o+1}
					}
					if(listEnglishPosts[[i]][[k]]$verified[m] == "NA"){
						if(verifyText(listEnglishPosts[[i]][[k]]$message[m],listEnglishPosts[[i]][[k]]$keyword[m])==T){
							listEnglishPosts[[i]][[k]]$verified[m] <- "yes"
						}
						else{
						listEnglishPosts[[i]][[k]]$verified <- "no"
						}
					}
					m=m+1
				}
				k=k+1
				print (k)
			}	
			else{
				listEnglishPosts[[i]][[k]] <- emptyDF
				print (paste("page",k,sep=" "))
				k=k+1
				print(k)
			}
		}
		i=i+1
	}