# Script that takes samples of articles from CrossRef and displays information about the percentage of "Pure Latin" characters in their titles
2 |
|
---|
3 | from habanero import Crossref
|
---|
4 | import get_unicode_blocks
|
---|
5 |
|
---|
def get_sample(sampleSize):
    """Request a random sample of works from CrossRef.

    The requested size is capped at 100 before the query is sent, because a
    single sample query is limited to 100 items. Returns the result of the
    ``Crossref.works`` call unchanged.
    """
    # Supplying a contact address places the requests in the "polite" pool.
    client = Crossref(mailto="[email protected]")
    # Anything above the limit is reduced to 100.
    capped_size = min(sampleSize, 100)
    return client.works(sample=capped_size)
|
---|
14 |
|
---|
def process_sample(sampleQuery):
    """Return the percentage of Latin characters in each item's first title.

    ``sampleQuery`` is read as ``sampleQuery["message"]["items"]``, a sequence
    of dict-like items. For every item that has a non-empty ``"title"`` list,
    the first title is inspected and the share of its characters whose code
    point is <= 255 is appended to the result as a percentage (0-100).

    Items with no ``"title"`` key, or with an empty title list, are skipped
    and contribute nothing to the result, so the returned list may be shorter
    than the number of items. A title containing no such characters
    (including an empty string) yields ``0``.
    """
    percentages = []
    for item in sampleQuery["message"]["items"]:
        titles = item.get("title")
        # Skip items without a usable title; indexing an empty title list
        # would otherwise raise IndexError.
        if not titles:
            continue
        title = titles[0]
        # Code points 0-255 are what this script counts as "Latin".
        latin_count = sum(1 for char in title if ord(char) <= 255)
        if latin_count == 0:
            # Also covers an empty title string, avoiding a division by zero.
            percentages.append(0)
        else:
            # Divide before scaling so a fully Latin title is exactly 100.0.
            percentages.append(latin_count / len(title) * 100)
    return percentages
|
---|
43 |
|
---|
def analyse_processed_sample(processedSamples):
    """Print summary statistics for the processed samples.

    ``processedSamples`` is an iterable of iterables of per-article Latin
    percentages. Three lines are printed: the total number of articles, the
    mean percentage across all of them, and how many articles are exactly
    100 percent Latin. Nothing is returned.

    When there are no articles at all, the average is reported as 0 rather
    than dividing by zero.
    """
    # Flatten in sample order so the running sum matches a nested loop.
    percentages = [
        articleData
        for processedSample in processedSamples
        for articleData in processedSample
    ]
    totalNumber = len(percentages)
    numberPureLatin = sum(1 for articleData in percentages if articleData == 100)

    # Guard the division: no samples, or no titled items, means no articles.
    if totalNumber:
        averagePercentLatin = sum(percentages) / totalNumber
    else:
        averagePercentLatin = 0

    print("Total Number of Articles Analysed: " + str(totalNumber))
    print("Average Percent Latin: " + str(averagePercentLatin))
    print("Number Pure Latin: " + str(numberPureLatin))
|
---|
60 |
|
---|
# Ask how many samples to draw and how large each one should be.
print("How many samples would you like to do? ", end="")
numSamples = int(input())
print("How big should each sample be? (MAX 100) ", end="")
sampleSize = int(input())

# One list of per-title Latin percentages for every sample fetched,
# each sample being requested and processed in turn.
sampleLatinData = [
    process_sample(get_sample(sampleSize)) for _ in range(numSamples)
]

analyse_processed_sample(sampleLatinData)
|
---|
75 |
|
---|