#############
# CONSTANTS #
#############
library("tm")
# Source of the grant export and the name it is stored under locally.
fns_url <- "http://p3.snf.ch/P3Export/P3_GrantExport_with_abstracts.csv"
fns_file <- "P3_GrantExport_with_abstracts.csv"

# Cut-off date in day.month.year form; it is parsed with "%d.%m.%Y" by the
# main script.
begin_date <- "1.1.2007"

# Directory that every relative path in this script resolves against.
work_directory <- "~/Desktop"

# Create the working directory on first use, then switch into it.
work_directory_missing <- !file.exists(work_directory)
if (work_directory_missing) {
  dir.create(work_directory)
}
setwd(work_directory)

####################
# HELPER FUNCTIONS #
####################


# Tag every project with the language of its abstract and mirror the
# abstracts on disk as <language>/<Project.Number>.txt files, relative to the
# current working directory.
#
# Args:
#   fns_data: data frame with at least the columns "Project.Number" and
#             "Abstract".
#
# Returns:
#   fns_data with an "Abstract.Language" column holding "fr", "de", "en",
#   "it" or "other". When an earlier split is reused, projects that have no
#   file in any language directory keep the placeholder " ".
#
# Side effects:
#   Unless an earlier split is reused, the directories "fr", "de", "it", "en"
#   and "other" are deleted and recreated, and one text file per project is
#   written into them.
devide_by_lang <- function(fns_data) {
  lang_dirs <- c("en", "fr", "it", "de", "other")

  if (file.exists("en")) {
    # Only ask when somebody can answer: in a non-interactive session
    # readline() returns "" immediately and the loop would never end, so the
    # existing split is reused there.
    answer <- "N"
    if (interactive()) {
      answer <- ""
      while (!answer %in% c("Y", "N")) {
        answer <- toupper(trimws(
          readline(prompt = "Do you want to redevide texts by language? Y/N \n")
        ))
      }
    }
    if (answer == "N") {
      # Rebuild the language column from the file names found on disk.
      project_ids <- fns_data[["Project.Number"]]
      language <- rep(" ", nrow(fns_data))
      for (lang_dir in lang_dirs) {
        # Anchored, escaped pattern: strip only a trailing ".txt".
        known_ids <- sub("\\.txt$", "", list.files(lang_dir))
        language[project_ids %in% known_ids] <- lang_dir
      }
      fns_data[["Abstract.Language"]] <- language
      return(fns_data)
    }
  }

  stopwords_in_text <- function(text, stopwords) {
    # Number of stop words from `stopwords` that occur in `text` as whole
    # words (case-insensitive). Each stop word counts once per matching
    # element of `text`, however often it is repeated inside that element.
    hits <- vapply(
      stopwords,
      function(stopword) {
        # paste0(): the default separator of paste() would put literal
        # spaces between the word and its \\b anchors.
        length(grep(paste0("\\b", stopword, "\\b"), text, ignore.case = TRUE))
      },
      integer(1)
    )
    sum(hits)
  }

  # recreate the language directories from scratch
  for (lang_dir in lang_dirs) {
    unlink(lang_dir, recursive = TRUE)
    dir.create(lang_dir)
  }

  # The stop-word lists do not depend on the row, so fetch them once.
  stopword_sets <- list(
    fr = stopwords(kind = "fr"),
    de = stopwords(kind = "de"),
    en = stopwords(kind = "en"),
    it = stopwords(kind = "it")
  )

  # get abstract language and save abstract file in lang dir
  language <- rep(" ", nrow(fns_data))
  for (i in seq_len(nrow(fns_data))) {
    abstract <- fns_data[i, "Abstract"]
    counts <- vapply(
      stopword_sets,
      function(stopword_set) stopwords_in_text(abstract, stopword_set),
      numeric(1)
    )
    lang <- names(counts)[counts == max(counts)]

    # A tie (including "no stop word found at all") is not a decision.
    if (length(lang) != 1) {
      lang <- "other"
    }
    language[i] <- lang
    filename <- paste0(lang, "/", fns_data[i, "Project.Number"], ".txt")
    write.table(abstract, file = filename, append = FALSE, col.names = FALSE,
                row.names = FALSE, quote = FALSE)
  }
  fns_data[["Abstract.Language"]] <- language
  fns_data
}



# Write a data frame as an IRaMuTeQ-style corpus file.
#
# Each row becomes one record: a header line starting with "****" followed by
# one "*<column>_<value>" tag per entry of `var_cols` (non-alphanumeric
# characters are stripped from both the column name and the value), a blank
# line, the row's text, and another blank line.
#
# Args:
#   filename:         path of the file to create (overwritten if it exists).
#   main_text_column: name of the column holding the text of each record.
#   var_cols:         names of the columns exported as "*name_value" tags.
#   dataframe:        the data to export; zero rows give an empty file.
#
# Returns:
#   NULL, invisibly.
dataframe2iramuteq <- function(filename, main_text_column, var_cols, dataframe) {
  # Write through an explicit connection instead of sink(): console output is
  # never diverted and the file is closed even if a row fails.
  con <- file(filename, open = "w")
  on.exit(close(con), add = TRUE)

  # seq_len() so that an empty data frame produces no records at all.
  for (i in seq_len(nrow(dataframe))) {
    variables <- ""
    for (v in var_cols) {
      variables <- paste0(
        variables,
        "*", gsub("[^[:alnum:]]", "", v),
        "_", gsub("[^[:alnum:]]", "", dataframe[i, v]),
        " "
      )
    }
    # paste() and cat() both insert a space between their arguments; those
    # separators are part of the output format and are kept on purpose.
    cat(paste("****", variables, "\n\n"), file = con)
    cat(dataframe[i, main_text_column], "\n\n", file = con)
  }
  invisible(NULL)
}



##############
# BEGIN CODE #
##############


# if file not in working directory download it.
if (!file.exists(fns_file)) {
  download.file(fns_url, fns_file)
}
# import file
fns_data <- read.csv(fns_file, header = TRUE, stringsAsFactors = FALSE,
                     sep = ";", quote = "\"", dec = ".", fill = TRUE)
colnames(fns_data) <- c("Project.Number", "Project.Title", "Project.Title.English", "Responsible.Applicant", "Funding.Instrument", "Funding.Instrument.Hierarchy", "Institution", "University", "Discipline.Number", "Discipline.Name", "Discipline.Name.Hierarchy", "All.disciplines", "Start.Date", "End.Date", "Approved.Amount", "Keywords", "Abstract", "Lay.Summary.Lead.English", "Lay.Summary.English", "Lay.Summary..Lead.German", "Lay.Summary.German", "Lay.Summary.Lead.French", "Lay.Summary.French", "Lay.Summary.Lead.Italian", "Lay.Summary.Italian", "lang")

str(fns_data)
# fix types
fns_data[["Start.Date"]] <- as.Date(fns_data[["Start.Date"]], "%d.%m.%Y")
fns_data[["End.Date"]] <- as.Date(fns_data[["End.Date"]], "%d.%m.%Y")

print(paste("Imported", nrow(fns_data), "projects"))

# remove unused
# Rows are dropped with logical masks rather than df[-which(cond), ]: when
# nothing matches, -which() is an empty index and would drop EVERY row.
# Missing values never match, so such rows are kept.
no_abstract <- !is.na(fns_data[["Abstract"]]) & fns_data[["Abstract"]] == ""
print(paste("Eliminating", sum(no_abstract), "projects without abstract"))
fns_data <- fns_data[!no_abstract, ]

fns_data <- devide_by_lang(fns_data)

first_kept_day <- as.Date(begin_date, "%d.%m.%Y")
started_too_early <- !is.na(fns_data[["Start.Date"]]) &
  fns_data[["Start.Date"]] < first_kept_day
print(paste("Eliminating", sum(started_too_early), "projects started before", begin_date))
fns_data <- fns_data[!started_too_early, ]

print(paste("Keeping only frech Abstacts"))
fns_data <- fns_data[fns_data[["Abstract.Language"]] %in% "fr", ]

n <- nrow(fns_data)
print(paste(n, "projects left"))

write.csv(fns_data, file = "FNS_abstracts_with_language_form_2012.csv")

# Short, IRaMuTeQ-friendly names for the leading columns. Only those columns
# are renamed: assigning a shorter vector to all of colnames() would set the
# names of the remaining columns to NA.
short_names <- c("projectnumber", "projecttitle", "projecttitleen", "repsonsible", "fundinginstrument", "fundinginstrumenthierarchy", "institution", "university", "disciplinenumber", "disciplinename", "disciplinenamehierarchy", "alldisciplines", "startdate", "enddate", "approvedamount", "keywords", "abstract", "Lay.Summary.Lead.English", "Lay.Summary.English", "Lay.Summary..Lead.German", "Lay.Summary.German", "Lay.Summary.Lead.French", "Lay.Summary.French", "Lay.Summary.Lead.Italian", "Lay.Summary.Italian")
colnames(fns_data)[seq_along(short_names)] <- short_names

# Amounts flagged as not published become missing values before the column is
# converted to numbers.
amount_withheld <- fns_data[["approvedamount"]] %in% "data not included in P3"
fns_data[["approvedamount"]][amount_withheld] <- NA
fns_data[["approvedamount"]] <- as.numeric(fns_data[["approvedamount"]])
q <- quantile(fns_data[["approvedamount"]], na.rm = TRUE)

# include.lowest = TRUE: the first interval is otherwise open on the left and
# the smallest amount would get no category.
fns_data$amountcategory <- cut(fns_data[["approvedamount"]], q,
                               labels = c("1q", "2q", "3q", "4q"),
                               include.lowest = TRUE)

##### CHOOSE HERE WHICH OF THE VARIABLES ABOVE TO EXPORT
#OLD cols = c( "disciplinenumber", "amountcategory", "fundinginstrument", "institution", "university", "disciplinename")

cols <- c("amountcategory", "fundinginstrument", "university", "disciplinename")
dataframe2iramuteq("fns_fr_depuis_2007_reduit.txt", "abstract", cols, fns_data)
