source: other-projects/metadata-encoding/py/using-api/compareTitlesWithSubtitlesEquivalence.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 3 months ago

add comments adding context to functions that require it

File size: 4.2 KB
Line 
1# Compare the titles from OpenAlex and xref, checking whether the subtitle accounts for any difference.
2# This would probably run much faster if OpenAlex were in a local database; API wait times are the major bottleneck.
3
4import sys, pymongo, json
5from alive_progress import alive_bar
6sys.path.insert(1, 'comparisonTest/')
7from compare import getDOIOpenAlex, getTitleFromOpenAlexObject
8
9#get next document from mongo
10#get title, subtitle, and DOI
11#get title from openalex
12#compare xref title+subtitle with oAlex title
13#count depending on:
14# - titles are directly equivalent, without subtitles
15# - titles are not equivalent, but subtitle makes them equivalent
16# - titles are not equivalent, even with subtitles
17#print to log file
18
#log file that printMessage() appends one JSON record to for every title pair that is not plainly equivalent
#it is opened (and truncated, because of "w" mode) as soon as this module loads; main() closes it at the end of the run
writeFile = open("titleLog.txt", "w", encoding="utf-8")
20
def printMessage(doi, xRefTitle, oAlexTitle, equivalenceMessage):
    """Report the outcome of one title comparison.

    When equivalenceMessage is exactly "Titles Equivalent" only a one-line
    notice is printed and nothing is logged. For every other message a
    delimited report is printed and the same four values are appended to the
    module-level writeFile as an indented JSON object.

    All four arguments are concatenated onto str labels, so each must be a str.
    """
    #equivalent titles are not interesting enough to log, just acknowledge them
    if equivalenceMessage == "Titles Equivalent":
        print(doi + " : Equivalent")
        return

    divider = "========================"
    reportLines = (
        ("DOI: ", doi),
        ("XREF TITLE: ", xRefTitle),
        ("OA TITLE: ", oAlexTitle),
        ("EQUIVALENT: ", equivalenceMessage),
    )

    print(divider)
    for label, value in reportLines:
        print(label + value)
    print(divider)

    logRecord = {
        "DOI": doi,
        "xrefTitle": xRefTitle,
        "openAlexTitle": oAlexTitle,
        "msg": equivalenceMessage,
    }
    writeFile.write(json.dumps(logRecord, indent=3) + "\n")
    #the file handler stays open for the whole run, so flush after every record
    #to make the data visible on disk straight away
    writeFile.flush()
38
def _onlySeparatorRemains(xRefTitle, xRefSubtitle, oAlexTitle):
    """Return True when oAlexTitle is the xref title and subtitle plus at most one other fragment.

    The OpenAlex title is cut on every occurrence of the xref title and then
    of the xref subtitle. Whatever non-blank pieces survive are the text that
    belongs to neither. Zero or one surviving piece (typically a separator
    such as ": ") counts as a match; more than one does not. The xref title
    must itself occur in the OpenAlex title, otherwise the result is False.
    """
    if not xRefTitle or xRefTitle not in oAlexTitle:
        return False

    fragments = [oAlexTitle]
    for part in (xRefTitle, xRefSubtitle):
        #str.split only accepts a single non-empty separator, so split on each part in turn
        if part:
            fragments = [piece for fragment in fragments for piece in fragment.split(part)]

    leftover = [fragment for fragment in fragments if fragment.strip()]
    return len(leftover) <= 1


def main():
    """Compare the title of every document in the Mongo "test" collection with its OpenAlex title.

    For each document the title, optional subtitle and DOI are read, the
    OpenAlex record for that DOI is fetched with getDOIOpenAlex, and the
    outcome is handed to printMessage with one of these messages:

    - "Titles Equivalent": the two titles are equal
    - "Titles Not Equivalent": they differ and there is no subtitle, or the
      OpenAlex title does not contain the subtitle
    - "Title + Subtitle Somewhat Equivalent to Title": the OpenAlex title is
      the title and subtitle with at most one other fragment
    - "Title + Subtitle Similar to Title but Not Equivalent": the subtitle is
      present but more than that separates the two titles

    Any exception raised while handling a document is printed and the loop
    moves on. The log file and the Mongo client are closed when the run ends,
    including when it is interrupted.
    """
    #initialise mongo connection
    mongoClient = pymongo.MongoClient()
    try:
        mongoCollection = mongoClient["test"]["test"]

        #get cursor for searching database, and the total so the progress bar knows its length
        documents = mongoCollection.find()
        count = mongoCollection.count_documents({})

        #for each document in the database
        with alive_bar(count) as bar:
            for document in documents:
                try:
                    #get the xref information
                    #NOTE(review): the comparisons below assume title and subtitle are plain strings - confirm against the collection
                    xRefTitle = document["title"]
                    xRefSubtitle = document.get("subtitle")
                    xRefDOI = document["DOI"]

                    #now lets get the openalex info
                    oAlexObject = getDOIOpenAlex(xRefDOI)
                    oAlexTitle = getTitleFromOpenAlexObject(oAlexObject)

                    #case 1 titles are same
                    if xRefTitle == oAlexTitle:
                        printMessage(xRefDOI, xRefTitle, oAlexTitle, "Titles Equivalent")
                        continue

                    #case 2 titles are not same and subtitle does not exist for this DOI
                    if xRefSubtitle is None or xRefSubtitle == "":
                        printMessage(xRefDOI, xRefTitle, oAlexTitle, "Titles Not Equivalent")
                        continue

                    #case 3 the openalex title does not even contain the subtitle, so they are completely different
                    if xRefSubtitle not in oAlexTitle:
                        printMessage(xRefDOI, xRefTitle, oAlexTitle, "Titles Not Equivalent")
                        continue

                    #the titles differ but the openalex title contains the subtitle
                    #decide how close they are by what is left once the title and subtitle are removed
                    if _onlySeparatorRemains(xRefTitle, xRefSubtitle, oAlexTitle):
                        verdict = "Title + Subtitle Somewhat Equivalent to Title"
                    else:
                        verdict = "Title + Subtitle Similar to Title but Not Equivalent"
                    printMessage(xRefDOI, xRefTitle + " " + xRefSubtitle, oAlexTitle, verdict)
                except Exception as error:
                    #probably from openalex not having DOI
                    print("Error: " + str(error))
                finally:
                    #advance the progress bar whether or not this document could be compared
                    bar()
    finally:
        #close the file handler and the database connection even if the run is cut short
        writeFile.close()
        mongoClient.close()
99
#run the comparison as soon as this file is executed
#there is no __main__ guard, so importing this module also starts a full run
main()
Note: See TracBrowser for help on using the repository browser.