source: other-projects/metadata-encoding/py/comparisonTest/sanitise.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 4 months ago

add comments adding context to functions that require it

File size: 1.8 KB
Line 
# Unfinished, experimental helper that was meant to be part of the equivalence ("comparison") test between different academic databases.
# It cleans titles that contain obviously spurious markup, to test whether two titles WOULD be the same without it.
3
4import re, sys, argparse
5
# Functions to make titles consistent between platforms
# Names of the face-markup tags that clear_face_markup strips; both the
# opening ("<b>") and closing ("</b>") form of each name is removed.
faceMarkupTags = [
    "b",
    "i",
    "u",
    "ovl",
    "sup",
    "sub",
    "scp",
    "tt",
]

# Substring used to recognise MathML tags: any tag matched by tagRegex that
# contains this text is removed by clear_math_ml_tags.
mathMLTag = "mml"

# Matches an opening or closing tag that contains no spaces: "<", an optional
# "/", one or more non-space characters (lazily), then ">".
# Written as a raw string because "\/" is not a valid string escape; the
# resulting pattern text is identical to the previous non-raw literal.
tagRegex = r"[<]\/?[^ ]+?[>]"
21
def clear_face_markup(title):
    """Return *title* with all face-markup tags removed.

    For every name in ``faceMarkupTags`` the exact opening form (``<name>``)
    and closing form (``</name>``) are deleted from the string. Matching is
    a plain, case-sensitive substring replacement, so tags written in a
    different case or carrying attributes are left in place.

    The function has no side effects: the leftover debug ``print`` that used
    to write the intermediate result to stdout on every call was removed.
    """
    cleaned = title
    for tag_name in faceMarkupTags:
        # Strip the opening tag first, then the closing tag.
        for tag in ("<" + tag_name + ">", "</" + tag_name + ">"):
            cleaned = cleaned.replace(tag, "")
    return cleaned
33
def clear_math_ml_tags(title):
    """Return *title* with every MathML tag removed.

    All substrings of the original title matching ``tagRegex`` are collected;
    those containing ``mathMLTag`` are then deleted, one after another, with
    a plain substring replacement (every occurrence of each one is removed).
    """
    # Tags are discovered on the untouched input, in left-to-right order.
    mathml_tags = [tag for tag in re.findall(tagRegex, title) if mathMLTag in tag]

    stripped = title
    for tag in mathml_tags:
        stripped = stripped.replace(tag, "")
    return stripped
42
def clear_tags(title):
    """Return *title* with face-markup tags removed, then MathML tags removed."""
    # Order matters only in that face markup is stripped before MathML tags.
    return clear_math_ml_tags(clear_face_markup(title))
46
def add_subtitles():
    """Placeholder: takes no input and returns a fixed string."""
    placeholder = "Title with added subtitles"
    return placeholder
49
50# commandline interface for checking things
def main():
    """Command-line entry point: parse a title and print it with tags cleared.

    Accepts a positional ``title`` and an optional ``-s/--subtitle``. The
    subtitle is parsed but not used by this function. The parsed arguments
    are printed first, followed by the result of ``clear_tags`` on the title.
    """
    cli = argparse.ArgumentParser(
        description="sanitises title input for consistent output",
        prog="sanitise",
    )
    cli.add_argument("title", help="Title of the article")
    cli.add_argument("-s", "--subtitle", help="Subtitle (if one provided)")
    parsed = cli.parse_args()

    # Show the parsed namespace before the cleaned title.
    print(parsed)
    print(clear_tags(parsed.title))


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.