# Unfinished experimental function that was supposed to be part of the
# equivalence "comparison" test between different academic databases.
# It cleans titles that contain obviously extraneous content (such as markup
# tags) to test whether they WOULD be the same without it.

import argparse
import re
import sys

# Functions to make title consistent between platforms

# Names of the face-markup tags that are stripped from titles. Each name is
# removed in its exact "<name>" and "</name>" forms.
faceMarkupTags = [
    "b",
    "i",
    "u",
    "ovl",
    "sup",
    "sub",
    "scp",
    "tt",
]

# Substring whose presence inside a matched tag marks that tag as MathML.
mathMLTag = "mml"

# Matches "<", an optional "/", one or more non-space characters (lazily), and
# the next ">". Written as a raw string so that "\/" is not parsed as an
# (invalid) string escape; the pattern text itself is unchanged.
# NOTE(review): because spaces are excluded, a tag that carries attributes
# (e.g. '<mml:math display="inline">') is not matched - confirm whether that
# is intended.
tagRegex = r"[<]\/?[^ ]+?[>]"

def clear_face_markup(title):
    """Return ``title`` with all face-markup tags removed.

    For every name in ``faceMarkupTags`` the literal strings ``<name>`` and
    ``</name>`` are deleted with plain ``str.replace``; the text between the
    tags is kept. Matching is exact, so differently-cased tags or tags with
    attributes are left untouched.

    Side effect: prints the cleaned title, prefixed with ``"regex done: "``,
    to stdout.
    """
    regexTitle = title
    for tagString in faceMarkupTags:
        # Strip the opening form first, then the closing form, of each tag.
        for newString in ("<" + tagString + ">", "</" + tagString + ">"):
            regexTitle = regexTitle.replace(newString, "")

    # Debug trace; note that no regular expression is involved here despite
    # the wording of the message.
    print("regex done: " + regexTitle)
    return regexTitle


def clear_math_ml_tags(title):
    """Return ``title`` with every MathML tag removed.

    A tag is any substring matched by ``tagRegex``. It is treated as MathML,
    and deleted, when it contains the substring ``mathMLTag``; all other
    matched tags and all text between tags are left as they are.
    """

    def _drop_if_math_ml(match):
        # Replace MathML tags with nothing; hand every other tag back as-is.
        tag = match.group(0)
        return "" if mathMLTag in tag else tag

    # One pass over the title instead of a separate str.replace per match.
    return re.sub(tagRegex, _drop_if_math_ml, title)


def clear_tags(title):
    """Return ``title`` with face-markup tags and MathML tags removed.

    Face markup is stripped first via ``clear_face_markup`` (which also prints
    a debug line), then the result is passed through ``clear_math_ml_tags``.
    """
    facemarkupRemovedTitle = clear_face_markup(title)
    return clear_math_ml_tags(facemarkupRemovedTitle)


def add_subtitles():
    """Placeholder: return a fixed string instead of a real title.

    The function takes no arguments and always returns the literal
    ``"Title with added subtitles"``; no subtitle handling is implemented.
    """
    return "Title with added subtitles"


# commandline interface for checking things
def main():
    """Parse the command line, strip tags from the title and print the result.

    Arguments:
        title           positional, the article title to clean.
        -s/--subtitle   optional; parsed but not used anywhere in this
                        function.

    Prints the parsed argument namespace first (debug output), then the title
    returned by ``clear_tags``.
    """
    parser = argparse.ArgumentParser(
        prog="sanitise",
        description="sanitises title input for consistent output",
    )
    parser.add_argument("title", help="Title of the article")
    parser.add_argument("-s", "--subtitle", help="Subtitle (if one provided)")
    args = parser.parse_args()

    # Debug: show exactly what argparse produced.
    print(args)

    tagClearTitle = clear_tags(args.title)

    print(tagClearTitle)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()