# OVERVIEW OF THE DATA RETRIEVAL PROCESS
#
# Initially we download the metadata for the items of the journals we are interested in (see step 1). Data is saved in '<journal.issn>.journal.items.rds'
# Then we download the metadata of their citing items, i.e. all items that cite the papers of the journals we are interested in (see step 2). Data is saved in '<journal.issn>.citing.items.rds'
# In the above steps, all data are downloaded from Scopus in the STANDARD format, which is fast, but does not contain the full metadata records, i.e. it omits several fields. Thus, in steps 3.1 and 3.2, we download the COMPLETE records of the journal items we are interested in. The data is saved in 'all.journal.items.data.complete.rds', which contains the complete information of all the items of all the journals we are interested in.
# Now we have the required information to extract the Scopus author ids and save them to 'all.journal.items.author_ids.rds' (step 3.3). Then we download the STANDARD records of all publications of each of those authors (step 3.6) and save them to 'all.journal.items.authors.items.data.rds'. In step 3.7, we use this data to create the history of each author's publications and save it to 'all.journal.items.author.history.melted.rds'.




# .................................................................................................
# Libraries ----
# .................................................................................................

library(rscopus)
library(data.table)


# Replace the placeholder with your own key before running the script.
set_api_key("<WRITE YOUR SCOPUS API KEY>") #this should be set to a valid Scopus API key




# .................................................................................................
# Journals to be downloaded ----
# .................................................................................................
# Folder (relative to the working directory) that receives every file written
# by this script. It is created here when missing, so the saveRDS()/write.csv()
# calls further down do not fail on a fresh checkout.
folder.data <- "data"
if (!dir.exists(folder.data)) {
  dir.create(folder.data, recursive = TRUE)
}

# ISSNs of the journals of interest.
journal.issns <- c(
  "0160-7383", "1045-3172", "0007-1315",
  "0007-2303", "1045-2354", "0263-7758",
  "0014-2921", "0960-6491", "1057-7408",
  "0304-3878", "0022-1031", "0167-6296",
  "0022-2186", "0022-2879", "0963-1798",
  "0894-3796", "1053-1858", "0743-0167",
  "0001-8791", "0960-1627", "0034-3404",
  "0034-6535", "0261-5177", "0305-750X",
  "0004-3702", "0021-9398", "0090-5364",
  "0096-3445", "1082-989X", "0899-8256",
  "0022-1996", "0734-306X", "0092-0703",
  "0272-4987", "0741-6261", "0092-5853",
  "0003-0554", "0003-066X", "0025-5610",
  "0038-4038"
)



journal.issn <- journal.issns[1]  # the rest of the code refers to the first journal, i.e. "0160-7383"



# .................................................................................................
# STEP 1, download journal items ----
# .................................................................................................




# Data structures:
# .........................................
#  journal.items: A list of the journal items for which the citing items will be sought
#     $df$df: the data frame of the items
#     $df$affiliation: a data frame with the affiliations of the items
#     $entries: the list returned by the rscopus package
#
# saved in data/<ISSN>_journal_items.csv and in data/<ISSN>.journal.items.rds

# Ask Scopus for the items of the selected journal; the PUBYEAR bounds in the
# query string are strict inequalities (> 1992 and < 2004).
# NOTE(review): the overview at the top of the file says steps 1 and 2 use the
# STANDARD view, but this request asks for COMPLETE -- confirm which is intended.
journal.items <- scopus_search(
  query = paste0("issn(", journal.issn, ") AND PUBYEAR > 1992 AND PUBYEAR < 2004"),
  view = "COMPLETE",
  verbose = TRUE
)

# Keep the flattened representation of the entries next to the raw entries.
journal.items$df <- gen_entries_to_df(journal.items$entries)

# Persist the whole object (raw entries + flattened tables) as RDS ...
saveRDS(
  journal.items,
  file = paste0(folder.data, "/", journal.issn, ".journal.items.rds")
)

# ... and the flattened item table as CSV.
write.csv(
  journal.items$df$df,
  file = paste0(folder.data, "/", journal.issn, "_journal_items.csv"),
  row.names = FALSE
)



# .................................................................................................
# STEP 2, download citing items ----
# .................................................................................................


# Data structures:
# .........................................
#  citing.items.list: A list with all the data downloaded for citing-items, as returned by rscopus.
#                     The Key is the journal.item$eid
#
#  rscopus data is returned with a STANDARD view.
#
#
#  saved in <ISSN>.citing.items.rds
#
# .........................................
# citing.items: Affiliation correspondence for each citing-item. One citing item may have multiple affiliations
#   data.table(journal.eid=character(),citing.eid=character(),country=character(),year=numeric())
#
#   saved in <ISSN>.citing.items.affil.csv
#


#................................................................................................
# Loop journal items eid and download citations 

# EIDs of the journal items whose citing items are downloaded.
eids <- journal.items$df$df$eid

# One element per journal item, keyed by the journal item's EID.
citing.items.list <- list()

# The bar runs from 0 (nothing done yet) to the number of items. max(..., 1)
# keeps txtProgressBar() from failing when there are fewer than two items
# (it requires max > min).
pb <- txtProgressBar(min = 0, max = max(length(eids), 1), style = 3)
i <- 0
setTxtProgressBar(pb, i)

for (eid.cur in eids) {

  # A failed request must not abort the whole download: warn and move on.
  citing.items.current.scopus <- tryCatch(
    scopus_search(query = paste0("REFEID(", eid.cur, ")"),
                  view = "STANDARD", verbose = FALSE),
    error = function(e) {
      warning(paste0("Could not download references for ", eid.cur))
      warning(paste0("Error message: ", e))
      NULL
    }
  )

  if (!is.null(citing.items.current.scopus)) {
    # The previous `citing.items.list[eid.cur] = <result>` assigned a
    # multi-element list through single brackets, which keeps only the first
    # component and raises a length-mismatch warning. Later steps pass each
    # element straight to gen_entries_to_df(), i.e. they expect the raw
    # entries, so store those explicitly.
    citing.items.list[[eid.cur]] <- citing.items.current.scopus$entries
  }

  #information will be extracted by extract_affiliations function

  # Short pause between consecutive requests.
  Sys.sleep(.2)
  i <- i + 1
  setTxtProgressBar(pb, i)

}

close(pb)




#................................................................................................
# Save downloaded citing-items


saveRDS(citing.items.list,
        file = paste0(folder.data, "/", journal.issn, ".citing.items.rds")
)








# .................................................................................................
# STEP 3, download author metadata ----
# .................................................................................................


# STEP 3.1, Extract EIDs for journal.items and citing.items ----
# .................................................................................................

# Accumulators for the EIDs read from disk and for the summary counts.
citing.eids <- character()
journal.eids <- character()
citation.count <- 0
journal.items.count <- 0

#load all eids and find unique

print(paste0("Reading ISSN: ", journal.issn))

# Journal items saved by step 1.
if (!file.exists(paste0(folder.data, "/", journal.issn, ".journal.items.rds"))) {
  print(" .... journal not found")
} else {
  journal.items.cur <- readRDS(
    file = paste0(folder.data, "/", journal.issn, ".journal.items.rds")
  )
  journal.eids <- c(journal.eids, journal.items.cur$df$df$eid)
  journal.items.count <- journal.items.count +
    length(journal.items.cur$df$df$eid)
}

# EIDs of the citing items.
# NOTE(review): nothing in this script writes '<ISSN>.citing.items.eids.rds'
# (step 2 saves '<ISSN>.citing.items.rds'); confirm where this file comes from.
if (!file.exists(paste0(folder.data, "/", journal.issn, ".citing.items.eids.rds"))) {
  print(" .... citing eids not found")
} else {
  eids.cur <- readRDS(
    file = paste0(folder.data, "/", journal.issn, ".citing.items.eids.rds")
  )
  # The count includes duplicates; the EID vector itself is de-duplicated.
  citation.count <- citation.count + length(eids.cur)
  citing.eids <- unique(c(citing.eids, eids.cur))
}

print(paste0("Total number of journal items included: ", journal.items.count))
print(paste0("Total number of citations included: ", citation.count))

#save data
saveRDS(
  citing.eids,
  file = paste0(folder.data, "/", "all.journals.citing.items.eids.rds")
)
saveRDS(
  journal.eids,
  file = paste0(folder.data, "/", "all.journals.items.eids.rds")
)




# STEP 3.2, Download COMPLETE records of journal items in bunches ----
# .................................................................................................


# Batch layout: EIDs from..to are requested `step` at a time (currently one
# EID per request).
from <- 1
to <- length(journal.eids)
step <- 1

# Result container: the entries and the HTTP status code of every request,
# both indexed by request number.
journal.items.data.complete <- list(
  entries = list(),
  status_codes = c()
)

complete.items.path <- paste0(folder.data, "/all.journal.items.data.complete.rds")

# seq(from, to, step) raises "wrong sign in 'by' argument" when there is
# nothing to download (to == 0), so build the batch starts defensively.
batch.starts <- if (to >= from) seq(from, to, step) else integer(0)

cur.query.num <- 1
for (eid.num in batch.starts) {
  # Positions of the EIDs in this batch, combined into one "EID(..) OR EID(..)"
  # query.
  eid.sel.nums <- eid.num:min((eid.num + (step - 1)), to)
  eid.sel.query.each <- paste0("EID(", journal.eids[eid.sel.nums], ")")
  eid.sel.query <- paste0(eid.sel.query.each, collapse = " OR ")

  print(paste0("Doing ", eid.num, " out of ", to))
  eids.data.cur <- scopus_search(query = eid.sel.query,
                                 view = "COMPLETE",
                                 verbose = FALSE)

  journal.items.data.complete[["status_codes"]][cur.query.num] <-
    eids.data.cur$get_statements[["status_code"]]
  journal.items.data.complete[["entries"]][[cur.query.num]] <-
    eids.data.cur$entries

  cur.query.num <- cur.query.num + 1
  Sys.sleep(.25)
  # Periodic checkpoint so a crash does not lose everything downloaded so far.
  if (cur.query.num %% 200 == 0) {
    saveRDS(journal.items.data.complete, file = complete.items.path)
  }
}


saveRDS(journal.items.data.complete, file = complete.items.path)





# STEP 3.3, Extract author information from the COMPLETE records of journal items ----
# .................................................................................................
# One row per (journal item, author) pair found in the COMPLETE records.
# The empty table fixes the column names and types of the result.
journal.items.author_ids <- data.table(
  item.eid = character(),
  author_id = character(),
  author_name = character()
)

complete.retrievals <- journal.items.data.complete$entries

# max(..., 1) keeps txtProgressBar() valid (it requires max > min) when there
# are no retrievals.
pb <- txtProgressBar(min = 0, max = max(length(complete.retrievals), 1), style = 3)

# A missing field comes back as NULL, which data.table() would drop as a
# column and thereby break the row binding; use NA instead.
field.or.na <- function(x) if (is.null(x)) NA_character_ else x

# Rows are collected in a list and bound once at the end; growing the table
# with rbind() inside the loops copies it on every iteration.
author.rows <- list()

for (retrieval.num in seq_along(complete.retrievals)) {

  retrieval.items <- complete.retrievals[[retrieval.num]]

  for (item.num in seq_along(retrieval.items)) {

    item.cur <- retrieval.items[[item.num]]

    # Entries without author information (e.g. error entries) are skipped.
    if (!"author" %in% names(item.cur)) next

    for (author.num in seq_along(item.cur$author)) {
      author.cur <- item.cur$author[[author.num]]
      author.rows[[length(author.rows) + 1L]] <- data.table(
        item.eid = field.or.na(item.cur$eid),
        author_id = field.or.na(author.cur$authid),
        author_name = field.or.na(author.cur$authname)
      )
    }

  }
  setTxtProgressBar(pb, retrieval.num)
}

close(pb)

journal.items.author_ids <- rbindlist(
  c(list(journal.items.author_ids), author.rows),
  use.names = TRUE
)


saveRDS(journal.items.author_ids, file = paste0(folder.data, "/all.journal.items.author_ids.rds"))
# Note: the file is tab-separated despite its .csv extension.
write.table(journal.items.author_ids, file = paste0(folder.data, "/journal.items.author_ids.csv"), sep = "\t")




# STEP 3.4, Download COMPLETE records of citing.items in bunches ----
# .................................................................................................

# citing.items.list is keyed by the EID of the *journal* item; the EIDs of the
# citing items are inside the entries stored under each key. Taking
# names(citing.items.list) here (as the code did before) would re-download the
# journal items instead of the citing items this step is about.
citing.eids <- unique(unlist(
  lapply(citing.items.list, function(entries) {
    # Entries without an "eid" field (e.g. "no results" entries) yield NULL
    # and are dropped by unlist().
    lapply(entries, function(entry) if (is.list(entry)) entry[["eid"]] else NULL)
  }),
  use.names = FALSE
))
if (is.null(citing.eids)) citing.eids <- character()

#for(eid.num in seq(1,length(citing.eids),10)) {
# Batch layout: ten EIDs per request.
from <- 1
to <- length(citing.eids)
step <- 10

# Result container: the entries and the HTTP status code of every request,
# both indexed by request number.
citing.items.data.complete <- list(
  entries = list(),
  status_codes = c()
)

citing.complete.path <- paste0(folder.data, "/all.selected.citing.eids.data.rds")

# seq(from, to, step) raises "wrong sign in 'by' argument" when there is
# nothing to download (to == 0), so build the batch starts defensively.
batch.starts <- if (to >= from) seq(from, to, step) else integer(0)

cur.query.num <- 1
for (eid.num in batch.starts) {
  # Positions of the EIDs in this batch, combined into one "EID(..) OR EID(..)"
  # query.
  eid.sel.nums <- eid.num:min((eid.num + (step - 1)), to)
  eid.sel.query.each <- paste0("EID(", citing.eids[eid.sel.nums], ")")
  eid.sel.query <- paste0(eid.sel.query.each, collapse = " OR ")

  print(paste0("Doing ", eid.num, " out of ", to))
  eids.data.cur <- scopus_search(query = eid.sel.query,
                                 view = "COMPLETE",
                                 verbose = FALSE)

  citing.items.data.complete[["status_codes"]][cur.query.num] <-
    eids.data.cur$get_statements[["status_code"]]
  citing.items.data.complete[["entries"]][[cur.query.num]] <-
    eids.data.cur$entries

  cur.query.num <- cur.query.num + 1
  # Periodic checkpoint so a crash does not lose everything downloaded so far.
  if (cur.query.num %% 100 == 0) {
    saveRDS(citing.items.data.complete, file = citing.complete.path)
  }
}


saveRDS(citing.items.data.complete, file = citing.complete.path)




# STEP 3.5, Retrieve author information from citing.items ----
# .................................................................................................


# One row per (citing item, author) pair. The empty table fixes the column
# names and types of the result.
citing.items.author_ids <- data.table(
  citing.item.eid = character(),
  author_id = character(),
  author_name = character()
)

citing.retrievals <- citing.items.data.complete$entries

# max(..., 1) keeps txtProgressBar() valid (it requires max > min) when there
# are no retrievals.
pb <- txtProgressBar(min = 0, max = max(length(citing.retrievals), 1), style = 3)

# Per-retrieval results are collected here and bound once after the loop
# instead of growing the table with rbind() on every iteration.
author.chunks <- vector("list", length(citing.retrievals))

for (retrieval.num in seq_along(citing.retrievals)) {

  entries <- gen_entries_to_df(citing.retrievals[[retrieval.num]])

  # Retrievals that carry no usable author table (e.g. requests that returned
  # no results) are skipped instead of failing on a missing column.
  has.authors <-
    is.data.frame(entries$author) &&
    all(c("authid", "authname", "entry_number") %in% colnames(entries$author)) &&
    is.data.frame(entries$df) &&
    all(c("eid", "entry_number") %in% colnames(entries$df))

  if (has.authors) {
    # Attach the item's EID to each author row via the shared entry_number,
    # then drop duplicated rows.
    authors <- unique(data.table(merge(entries$author[, c("authid", "authname", "entry_number")],
                                       entries$df[, c("eid", "entry_number")],
                                       all.x = TRUE, by = "entry_number")))

    author.chunks[[retrieval.num]] <-
      authors[, list(citing.item.eid = eid, author_id = authid, author_name = authname)]
  }

  setTxtProgressBar(pb, retrieval.num)
}

close(pb)

# rbindlist() ignores the NULL slots left by skipped retrievals.
citing.items.author_ids <- rbindlist(
  c(list(citing.items.author_ids), author.chunks),
  use.names = TRUE
)


saveRDS(citing.items.author_ids, file = paste0(folder.data, "/all.citing.items.author_ids.rds"))
# Note: the file is tab-separated despite its .csv extension.
write.table(citing.items.author_ids, file = paste0(folder.data, "/all_citing_items_author_ids.csv"), sep = "\t", row.names = FALSE)






# STEP 3.6, Download STANDARD records for each author ----
# .................................................................................................

# Distinct author ids of the journal items; missing ids cannot be queried.
journal.items.authid.unique <- unique(journal.items.author_ids$author_id)
journal.items.authid.unique <-
  journal.items.authid.unique[!is.na(journal.items.authid.unique)]

# Entries of every author's publications, keyed by author id.
journal.items.authors.items <- list()

authors.items.path <- paste0(folder.data, "/all.journal.items.authors.items.data.rds")

# Number of requests issued so far. Counting from 0 and incrementing before
# the progress output makes "k/N" mean "k authors processed".
cur.request <- 0
#pb <- txtProgressBar(1, length(journal.items.authid.unique), style=3);

for (authid in journal.items.authid.unique) {

  cur.request <- cur.request + 1

  # As in step 2, a single failed request must not abort the whole download:
  # warn, skip this author and continue.
  auth.item.cur <- tryCatch(
    scopus_search(query = paste0("AU-ID(", authid, ")"),
                  view = "STANDARD", verbose = FALSE),
    error = function(e) {
      warning(paste0("Could not download publications of author ", authid,
                     ": ", conditionMessage(e)))
      NULL
    }
  )

  if (!is.null(auth.item.cur)) {
    journal.items.authors.items[[authid]] <- auth.item.cur$entries
  }

  if (cur.request %% 5 == 0) {
    print(paste0(cur.request, "/", length(journal.items.authid.unique)))
  }
  # Periodic checkpoint so a crash does not lose everything downloaded so far.
  if (cur.request %% 500 == 0) {
    saveRDS(journal.items.authors.items, file = authors.items.path)
  }

  #setTxtProgressBar(pb, cur.request)

}

saveRDS(journal.items.authors.items, file = authors.items.path)






# STEP 3.7,  For authors, create history of publications, based on their STANDARD records----
# .................................................................................................


# Reload the per-author entries saved by the previous step, so this step can
# be run on its own.
journal.items.authors.items <- readRDS(file = paste0(folder.data, "/all.journal.items.authors.items.data.rds"))

# One row per (author, publication). The empty table fixes the column names
# and types of the result.
author.history.melted <- data.table(
  authid = character(), eid = character(), year = numeric(), type = character(),
  `citedby-count` = numeric(), issn = character()
)

# Declared for the affiliation history; not filled by this step.
author.affiliation.melted <- data.table(
  authid = character(), eid = character(), year = numeric()

)

# Column `column` of `dt` as character, or all-NA when the column is absent
# (not every record carries every field, e.g. an ISSN).
history.column <- function(dt, column) {
  if (column %in% names(dt)) as.character(dt[[column]]) else rep(NA_character_, nrow(dt))
}

# Per-author tables are collected here and bound once after the loop instead
# of growing the result with rbind() on every iteration.
history.chunks <- list()

cur_author_num <- 1
author_length <- length(names(journal.items.authors.items))
for (authid.cur in names(journal.items.authors.items)) {

  print(paste0(cur_author_num, "/", author_length))
  cur_author_num <- cur_author_num + 1

  df.cur <- gen_entries_to_df(journal.items.authors.items[[authid.cur]])

  df.cur.df <- data.table(df.cur$df)

  # Authors without any usable record (empty table or no EID column) are
  # skipped.
  if (nrow(df.cur.df) == 0 || !"eid" %in% names(df.cur.df)) next

  # The fields are read from the flattened item table. The earlier code read
  # them from `df.cur`, the list returned by gen_entries_to_df(), which has no
  # such elements and therefore yielded NULL for every field. The unused
  # affiliation merge that could fail on empty affiliations was dropped.
  history.chunks[[length(history.chunks) + 1L]] <- data.table(
    authid = authid.cur,
    eid = history.column(df.cur.df, "eid"),
    year = as.numeric(substr(history.column(df.cur.df, "prism:coverDate"), 1, 4)),
    type = history.column(df.cur.df, "subtype"),
    `citedby-count` = as.numeric(history.column(df.cur.df, "citedby-count")),
    issn = history.column(df.cur.df, "prism:issn")
  )

}

author.history.melted <- rbindlist(
  c(list(author.history.melted), history.chunks),
  use.names = TRUE
)

saveRDS(author.history.melted, file = paste0(folder.data, "/all.journal.items.author.history.melted.rds"))
# Note: the file is tab-separated despite its .csv extension.
write.table(author.history.melted, file = paste0(folder.data, "/all.journal.items.author.history.melted.csv"), sep = "\t")



# STEP 3.8, Retrieve affiliation information for citing.items ----
# .................................................................................................

# One row per (journal item, citing item, affiliation). The empty table fixes
# the column names and types of the result.
citing.items <- data.table(journal.eid = character(), citing.eid = character(), country = character(), affilname = character(), year = numeric(),
                           issn = character(), `cited-by-count` = numeric(), type = character())

# Keys of citing.items.list, i.e. the EIDs of the journal items whose citing
# items were downloaded (the variable name is kept for compatibility).
citing.eids <- names(citing.items.list)

# The bar is sized by the vector that is actually looped over; max(..., 1)
# keeps txtProgressBar() valid (it requires max > min) for empty input.
pb <- txtProgressBar(min = 0, max = max(length(citing.eids), 1), style = 3)
i <- 0

# Columns read from the item table and from the affiliation table below.
item.columns <- c("eid", "prism:coverDate", "entry_number", "prism:issn", "citedby-count", "prism:aggregationType")
affil.columns <- c("entry_number", "affiliation-country", "affilname")

# Per-item results are collected here and bound once after the loop instead
# of growing the table with rbind() on every iteration.
citing.chunks <- vector("list", length(citing.eids))

for (eid.num in seq_along(citing.eids)) {

  eid.cur <- citing.eids[[eid.num]]

  #print(paste0("Doing citing items of eid=",eid.cur))

  df.cur <- gen_entries_to_df(citing.items.list[[eid.cur]])

  # Only items whose tables carry every column used below are processed;
  # anything else (e.g. "no results" entries) is skipped instead of failing
  # on a missing column.
  usable <-
    is.data.frame(df.cur$df) &&
    all(item.columns %in% colnames(df.cur$df)) &&
    is.data.frame(df.cur$affiliation) &&
    nrow(df.cur$affiliation) > 0 &&
    all(affil.columns %in% colnames(df.cur$affiliation))

  if (usable) {
    # all.y = TRUE keeps one row per affiliation of each citing item.
    citing.items.current <- data.table(merge(
      df.cur$df[, item.columns],
      df.cur$affiliation[, affil.columns],
      by = "entry_number",
      all.y = TRUE)
    )

    citing.chunks[[eid.num]] <- citing.items.current[!is.na(entry_number), list(
      journal.eid = eid.cur,
      citing.eid = eid,
      country = `affiliation-country`,
      affilname = affilname,
      year = as.numeric(substr(`prism:coverDate`, 1, 4)),
      issn = `prism:issn`,
      # Converted so the column matches the numeric type declared above.
      `cited-by-count` = as.numeric(`citedby-count`),
      type = `prism:aggregationType`
    )]
  }

  i <- i + 1
  setTxtProgressBar(pb, i)
} #loop eids

close(pb)

# rbindlist() ignores the NULL slots left by skipped items.
citing.items <- rbindlist(c(list(citing.items), citing.chunks), use.names = TRUE)

write.csv(
  citing.items,
  file = paste0(folder.data, "/", journal.issn, ".citing.items.affil.csv"),
  row.names = FALSE
)





