1 |
|
---|
2 | #include "text_t.h"
|
---|
3 | #include <ctype.h> // for isspace
|
---|
4 | #include <vector>
|
---|
5 |
|
---|
6 | void equiv_lookup(text_t& terms) {
|
---|
7 | // eventually we'll get these terms out of a config file or something...
|
---|
8 | vector<text_t> group;
|
---|
9 | vector< vector<text_t> > all_groups;
|
---|
10 |
|
---|
11 | group.push_back("triene");
|
---|
12 | group.push_back("trientine");
|
---|
13 | group.push_back("trien");
|
---|
14 | group.push_back("\"triethylenetetramine dihydrochloride\"");
|
---|
15 | group.push_back("\"triethylene tetramine dihydrochloride\"");
|
---|
16 | group.push_back("syprine");
|
---|
17 | group.push_back("teta");
|
---|
18 | group.push_back(text_t("\"trien 2hcl\""));
|
---|
19 | group.push_back(text_t("\"triethylenetetramine 2hcl\""));
|
---|
20 | group.push_back(text_t("\"bis aminoethyl ethanediamine dihydrochloride\""));
|
---|
21 | all_groups.push_back(group);
|
---|
22 |
|
---|
23 |
|
---|
24 | group.clear();
|
---|
25 | group.push_back("pkpd");
|
---|
26 | group.push_back("pkin");
|
---|
27 | group.push_back("pharmocokinetic");
|
---|
28 | group.push_back("pharmacodynamic");
|
---|
29 | all_groups.push_back(group);
|
---|
30 |
|
---|
31 |
|
---|
32 | group.clear();
|
---|
33 | group.push_back("safety");
|
---|
34 | group.push_back("\"side effects\"");
|
---|
35 | group.push_back("toxicity");
|
---|
36 | group.push_back("toxicology");
|
---|
37 | all_groups.push_back(group);
|
---|
38 |
|
---|
39 |
|
---|
40 | group.clear();
|
---|
41 | group.push_back("sod");
|
---|
42 | group.push_back("\"superoxide dismutase\"");
|
---|
43 | group.push_back("\"super oxide dismutase\"");
|
---|
44 | all_groups.push_back(group);
|
---|
45 |
|
---|
46 |
|
---|
47 | group.clear();
|
---|
48 | group.push_back("ceruloplasmin");
|
---|
49 | group.push_back("caeruloplasmin");
|
---|
50 | all_groups.push_back(group);
|
---|
51 |
|
---|
52 |
|
---|
53 | // we always convert to lowercase to do the lookup
|
---|
54 | text_t casestemoptions="";
|
---|
55 | text_t termproper="";
|
---|
56 |
|
---|
57 | text_t::const_iterator hash=terms.end();
|
---|
58 | text_t::const_iterator start=terms.begin();
|
---|
59 | while (hash!=start) {
|
---|
60 | if (*hash=='#') break;
|
---|
61 | --hash;
|
---|
62 | }
|
---|
63 | if (hash!=start) { // we found a hash
|
---|
64 | casestemoptions=substr(hash,terms.end());
|
---|
65 | termproper=substr(start,hash);
|
---|
66 | } else {
|
---|
67 | termproper=terms;
|
---|
68 | }
|
---|
69 |
|
---|
70 | // canonicalise termproper to lowercase for lookup
|
---|
71 | lc(termproper.begin(), termproper.end());
|
---|
72 |
|
---|
73 | // look up vectors
|
---|
74 | vector< vector<text_t> >::const_iterator group_here=all_groups.begin();
|
---|
75 | vector< vector<text_t> >::const_iterator group_done=all_groups.end();
|
---|
76 |
|
---|
77 | while (group_here != group_done) {
|
---|
78 | vector<text_t>::const_iterator syn_here=group_here->begin();
|
---|
79 | vector<text_t>::const_iterator syn_done=group_here->end();
|
---|
80 |
|
---|
81 | while (syn_here != syn_done) {
|
---|
82 | if (*syn_here == termproper) // we found a match
|
---|
83 | break;
|
---|
84 | ++syn_here;
|
---|
85 | }
|
---|
86 | if (syn_here != syn_done) { // we found a match in this group
|
---|
87 | syn_here=group_here->begin();
|
---|
88 | terms="(";
|
---|
89 |
|
---|
90 | while (syn_here != syn_done) {
|
---|
91 | terms+= *syn_here;
|
---|
92 | terms+= casestemoptions;
|
---|
93 | if (syn_here != syn_done - 1) {
|
---|
94 | terms+=" | ";
|
---|
95 | }
|
---|
96 | ++syn_here;
|
---|
97 | }
|
---|
98 | terms.push_back(')');
|
---|
99 | break;
|
---|
100 | }
|
---|
101 | ++group_here;
|
---|
102 | }
|
---|
103 |
|
---|
104 | }
|
---|
105 |
|
---|
106 | void insert_equiv_terms(text_t& query) {
|
---|
107 |
|
---|
108 | // we assume that for protemix, we don't have "high" unicode values.
|
---|
109 | // don't do terms inside quotes
|
---|
110 | text_t new_query="";
|
---|
111 |
|
---|
112 | unsigned int query_length=query.size();
|
---|
113 | unsigned int i;
|
---|
114 | bool inside_quotes=false;
|
---|
115 | bool inside_field=false; // we need to OR together the terms inside fields
|
---|
116 | bool in_a_term=false;
|
---|
117 | text_t::const_iterator here = query.begin();
|
---|
118 | text_t::const_iterator end = query.end();
|
---|
119 |
|
---|
120 | text_t::const_iterator term_start=here;
|
---|
121 | // | and &...
|
---|
122 | while (here < end) {
|
---|
123 | if (*here=='"') {
|
---|
124 | new_query.push_back('"');
|
---|
125 | if (inside_quotes==false) {
|
---|
126 | inside_quotes=true;
|
---|
127 | while (*(++here) != '"')
|
---|
128 | new_query.push_back(*here);
|
---|
129 | new_query.push_back('"');
|
---|
130 | } else {
|
---|
131 | inside_quotes=false;
|
---|
132 | }
|
---|
133 | } else if (isspace(*here)) {
|
---|
134 | if (in_a_term) { // we've reached the end of a term
|
---|
135 | // this might include case and stem options...
|
---|
136 | text_t term=substr(term_start,here);
|
---|
137 | equiv_lookup(term);
|
---|
138 | new_query+=term;
|
---|
139 | new_query+=" ";
|
---|
140 | in_a_term=false;
|
---|
141 | }
|
---|
142 | } else if (*here=='[') {
|
---|
143 | // start of a field...
|
---|
144 | } else if (*here==']') {
|
---|
145 | // end of a field...
|
---|
146 | } else if (!in_a_term) { // this char is start of term
|
---|
147 | term_start=here;
|
---|
148 | in_a_term=true;
|
---|
149 | }
|
---|
150 |
|
---|
151 | ++here;
|
---|
152 | }
|
---|
153 |
|
---|
154 | // tidy up for last term
|
---|
155 | if (in_a_term) {
|
---|
156 | text_t term=substr(term_start,here);
|
---|
157 | equiv_lookup(term);
|
---|
158 | new_query+=term;
|
---|
159 | }
|
---|
160 | query=new_query;
|
---|
161 | }
|
---|