# main.R -- harvest PubMed abstracts and build n-gram tables for
# next-word prediction (consumed by the companion Shiny app).
Get_PubMed_Data <- function(topic, start_date, end_date, return_count) {
  # Query PubMed (via the RISmed package) and return matching articles.
  #
  # Args:
  #   topic:        search term passed to EUtilsSummary (e.g. 'cardiology').
  #   start_date:   earliest publication year (mindate).
  #   end_date:     latest publication year (maxdate).
  #   return_count: maximum number of records to fetch (retmax).
  #
  # Returns:
  #   data.frame with character columns 'Title' and 'Abstract'; commas in
  #   abstracts are replaced by spaces so the text is CSV-safe downstream.
  library(RISmed)  # library() errors loudly if the package is missing; require() only warns

  search_query <- EUtilsSummary(topic, retmax = return_count,
                                mindate = start_date, maxdate = end_date)

  # Fetch the full records for the ids matched by the summary query.
  # (The original also evaluated summary(), QueryId(), class() and head()
  # here; inside a function those results were silently discarded, so the
  # dead expressions have been removed.)
  records <- EUtilsGet(search_query)

  pubmed_data <- data.frame(
    'Title'    = ArticleTitle(records),
    'Abstract' = AbstractText(records),
    stringsAsFactors = FALSE  # keep text as character (R < 4.0 defaulted to factors)
  )

  # Defensive: guarantee character type even on old R versions.
  pubmed_data$Title    <- as.character(pubmed_data$Title)
  pubmed_data$Abstract <- as.character(pubmed_data$Abstract)

  # Strip commas so abstracts survive the later write.csv round-trip.
  pubmed_data$Abstract <- gsub(",", " ", pubmed_data$Abstract, fixed = TRUE)

  pubmed_data
}
# Pull up to 1,500 cardiology abstracts published 2013-2015 (network call).
medical_corpus <- Get_PubMed_Data('cardiology', 2013, 2015, 1500)

# Interactive sanity checks: corpus dimensions and first few rows
# (auto-printed at top level).
dim(medical_corpus)
head(medical_corpus)
Text_To_Clean_Sentences <- function(text_blob) {
  # Normalize a blob of raw text into a vector of cleaned sentences:
  # lower-cased, alphabetic characters only, 1-2 letter words dropped,
  # whitespace collapsed.

  # Tag every sentence terminator (; . ! ?) with the marker 'ootoo' so the
  # sentence boundaries survive the character cleanup below.
  marked <- gsub(pattern=';|\\.|!|\\?', x=text_blob, replacement='ootoo')

  # Blank out everything that is not a letter (digits, punctuation, ...).
  letters_only <- gsub(pattern="[^[:alpha:]]", x=marked, replacement = ' ')

  lowered <- tolower(letters_only)

  # Drop 1- and 2-letter words together with the separator preceding them.
  no_short_words <- gsub(pattern="\\W*\\b\\w{1,2}\\b", x=lowered, replacement=' ')

  # Collapse runs of whitespace into a single space.
  squeezed <- gsub(pattern="\\s+", x=no_short_words, replacement=' ')

  # Recover the individual sentences by splitting on the marker.
  unlist(strsplit(x=squeezed, split='ootoo', fixed = TRUE))
}
45
46
corpus_sentences <- Text_To_Clean_Sentences(paste(medical_corpus$Abstract, collapse=" "))
47
48
49
test_sentence <- "this is a big sentence"
50
51
library(ngram)
52
ng_2 <- ngram(test_sentence , n=2)
53
print(ng_2)
54
55
Trim <- function( x ) {
56
  gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
57
}
Get_Ngrams <- function(sentence_splits, ngram_size=2) {
  # Extract every n-gram of `ngram_size` words from each sentence.
  #
  # Args:
  #   sentence_splits: character vector of (already cleaned) sentences.
  #   ngram_size:      number of words per n-gram (default 2).
  #
  # Returns:
  #   Character vector of all n-grams found, or NULL when none qualify
  #   (matching the original's c() accumulator behavior).
  per_sentence <- lapply(sentence_splits, function(sentence) {
    sentence <- Trim(sentence)
    if (nchar(sentence) == 0) {
      return(NULL)
    }
    # A sentence needs at least `ngram_size` words to yield an n-gram.
    # Bug fix: the original compared the number of separator runs (words
    # minus one, via gregexpr("\\W+", ...)) against ngram_size, which
    # wrongly skipped sentences with exactly ngram_size words.
    word_count <- length(strsplit(sentence, "\\W+")[[1]])
    if (word_count < ngram_size) {
      return(NULL)
    }
    get.ngrams(ngram(sentence, n = ngram_size))
  })
  # Flatten, dropping sentences that produced nothing (NULLs vanish);
  # avoids the original's O(n^2) c() append inside the loop.
  unlist(per_sentence)
}
71
72
n2 <- Get_Ngrams(corpus_sentences, ngram_size=2)
73
n3 <- Get_Ngrams(corpus_sentences, ngram_size=3)
74
n4 <- Get_Ngrams(corpus_sentences, ngram_size=4)
75
n5 <- Get_Ngrams(corpus_sentences, ngram_size=5)
76
77
# consolidate all n-gram vectors into one
78
n_all <- c(n2, n3, n4, n5)
79
80
# save the n-grams in the same folder as your shiny code
81
write.csv(n_all, "c:\\cordova\\pubmed_cardiology_ngrams.csv", row.names=FALSE)
82
83
head(n_all)
84
length(n_all)
# Predict the word following `word` by searching the n-gram table.
# The trailing space ensures the search word is not the last word of an
# n-gram (so there is always a following word to extract).
word <- 'infection '

# Collect every n-gram containing `word` at a word boundary
# (\< is the TRE left-word-boundary escape).
matches <- c()
for (sentence in n_all) {
  # find exact match with word-boundary escape
  if (grepl(paste0('\\<',word), sentence)) {
    print(sentence)
    matches <- c(matches, sentence)
  }
}

# Score each match by how much context precedes the search word: more
# leading characters means a longer (higher-order) n-gram match.
precision_match <- c()
for (a_match in matches) {
  # number of characters in front of the search word
  precision_match <- c(precision_match,nchar(strsplit(x = a_match, split = word)[[1]][[1]]))
}

# Keep the matches with the most context; sample() breaks ties randomly.
# NOTE(review): errors if `matches` is empty (max of zero-length vector) --
# guard before reuse.
best_matched_sentence <- sample(matches[precision_match == max(precision_match)],size = 1)

print(best_matched_sentence)
# NOTE(review): this whole section is a verbatim duplicate of the search
# block above (same `word`, same loops, same sampling). Left in place to
# preserve the script's printed output and RNG state, but it should be
# removed or folded into a function taking `word` as an argument.

# notice the trailing space at end to avoid picking last word
word <- 'infection '

matches <- c()
for (sentence in n_all) {
  # find exact match with word-boundary escape (\< in TRE regex)
  if (grepl(paste0('\\<',word), sentence)) {
    print(sentence)
    matches <- c(matches, sentence)
  }
}

# Score each match by the amount of context before the search word.
precision_match <- c()
for (a_match in matches) {
  # number of characters in front of the search word
  precision_match <- c(precision_match,nchar(strsplit(x = a_match, split = word)[[1]][[1]]))
}

# Keep the longest-context matches; sample() breaks ties randomly.
best_matched_sentence <- sample(matches[precision_match == max(precision_match)],size = 1)

print(best_matched_sentence)
# Extract the predicted next word from the winning n-gram:
# split the best matching sentence by the search word ...
best_match <- strsplit(x = best_matched_sentence, split = word)[[1]]
# ... then split the remainder by spaces and keep the first token,
# i.e. the word that immediately follows `word` in the n-gram.
best_match <-  strsplit(x = best_match[[2]], split = " ")[[1]]
best_match <- best_match[[1]]

print(best_match)