1 | from habanero import Crossref
|
---|
2 | import get_unicode_blocks
|
---|
3 |
|
---|
def get_sample(sampleSize):
    """Request a random sample of works from Crossref.

    Args:
        sampleSize: Requested number of works. Values above 100 are reduced
            to 100 before the query is sent.

    Returns:
        Whatever ``Crossref.works`` returns for the sample query.
    """
    # Supplying a contact address puts the requests into the "polite" pool.
    client = Crossref(mailto="[email protected]")
    # A single sample query is limited to 100 works, so cap the request.
    capped_size = min(sampleSize, 100)
    return client.works(sample=capped_size)
12 |
|
---|
def process_sample(sampleQuery):
    """Compute the percentage of Latin characters in each sampled title.

    A character counts as "Latin" when its Unicode code point is at most 255
    (the Basic Latin and Latin-1 Supplement blocks).

    Args:
        sampleQuery: Response mapping whose ``["message"]["items"]`` entry is
            a list of item mappings. Each item may carry a ``"title"`` list;
            only the first element of that list is examined.

    Returns:
        A list with one entry per item that has a title: the percentage
        (0-100) of that title's characters that are Latin. Items without a
        ``"title"`` key, or with an empty title list, are skipped. A title
        containing no Latin character at all (including an empty string)
        yields ``0``.

    Raises:
        KeyError: If ``sampleQuery`` lacks the ``"message"`` or ``"items"``
            key.
    """
    sampleDataToReturn = []
    for item in sampleQuery["message"]["items"]:
        # A missing "title" key and an empty title list both mean "no title";
        # indexing [0] on an empty list would raise IndexError.
        titles = item.get("title")
        if not titles:
            continue
        title = titles[0]
        numLatinChars = sum(1 for char in title if ord(char) <= 255)
        if numLatinChars == 0:
            # Also covers the empty-string title, avoiding a division by zero.
            sampleDataToReturn.append(0)
        else:
            sampleDataToReturn.append(numLatinChars / len(title) * 100)
    return sampleDataToReturn
41 |
|
---|
def analyse_processed_sample(processedSamples):
    """Print summary statistics for the Latin percentages of sampled titles.

    Prints three lines: the number of titles analysed, their mean Latin
    percentage, and how many titles are exactly 100% Latin.

    Args:
        processedSamples: Iterable of samples, each an iterable of per-title
            Latin percentages (numbers between 0 and 100).

    Returns:
        None. When no titles were collected at all, the mean is reported as
        0 instead of raising ``ZeroDivisionError``.
    """
    totalPercentLatin = 0
    numberPureLatin = 0
    totalNumber = 0

    for processedSample in processedSamples:
        for articleData in processedSample:
            totalNumber += 1
            totalPercentLatin += articleData
            if articleData == 100:
                numberPureLatin += 1

    # Zero samples, or samples in which no item had a title, leave nothing to
    # average; guard the division so the summary is still printed.
    averagePercentLatin = totalPercentLatin / totalNumber if totalNumber else 0

    print("Total Number of Articles Analysed: " + str(totalNumber))
    print("Average Percent Latin: " + str(averagePercentLatin))
    print("Number Pure Latin: " + str(numberPureLatin))
58 |
|
---|
# Ask how many samples to draw and how large each one should be.
print("How many samples would you like to do? ", end="")
numSamples = int(input())
print("How big should each sample be? (MAX 100) ", end="")
sampleSize = int(input())

# For every sample fetched, one list holding the percentage of each title
# that is made up of Latin characters.
sampleLatinData = [
    process_sample(get_sample(sampleSize)) for _ in range(numSamples)
]

analyse_processed_sample(sampleLatinData)
73 |
|
---|