source: other-projects/metadata-encoding/py/using-api/blockBlockAnalysis.py@ 38771

Last change on this file since 38771 was 38771, checked in by jc550, 3 months ago

start separating python

File size: 2.6 KB
Line 
1from habanero import Crossref
2import get_unicode_blocks
3
def get_sample(sampleSize):
    """Request a random sample of works from Crossref.

    sampleSize is the number of works to ask for. Values above 100 are
    reduced to 100 before the query is sent. The return value is whatever
    ``Crossref.works(sample=...)`` hands back, unmodified.
    """
    # Supplying a mailto address places the requests in the "polite" pool
    client = Crossref(mailto = "[email protected]")
    # A single sample query is limited to 100 works, so cap the request
    capped_size = 100 if sampleSize > 100 else sampleSize
    return client.works(sample=capped_size)
12
def process_sample(sampleQuery):
    """Return the percentage of Latin characters in each item's first title.

    sampleQuery is a mapping holding the items to inspect under
    ``sampleQuery["message"]["items"]``; each item is a dict whose optional
    "title" entry is a list of title strings. Only the first title of each
    item is examined.

    A character counts as Latin when its code point is at most 255.

    Returns a list with one number (0 to 100) per item that has a title.
    Items with no "title" key, or with an empty title list, contribute
    nothing to the result. A title with no Latin characters at all
    (including an empty string) contributes 0.
    """
    percentages = []
    for item in sampleQuery["message"]["items"]:
        titles = item.get("title")
        # Skip items that carry no usable title: either the key is missing
        # or the list is empty (indexing [0] on it would raise IndexError).
        if not titles:
            continue
        title = titles[0]
        latin_count = sum(1 for char in title if ord(char) <= 255)
        if latin_count == 0:
            # Also covers the empty-string title, avoiding a division by zero
            percentages.append(0)
        else:
            percentages.append(latin_count / len(title) * 100)
    return percentages
41
def analyse_processed_sample(processedSamples):
    """Print summary statistics for the per-title Latin percentages.

    processedSamples is an iterable of samples, each sample being an
    iterable of percentages (one number per analysed title).

    Prints three lines: the total number of values, their mean, and how
    many of them are exactly 100 (titles made up purely of Latin
    characters). Returns None.
    """
    # Flatten the per-sample lists into one sequence of percentages
    percentages = [value for sample in processedSamples for value in sample]

    totalNumber = len(percentages)
    numberPureLatin = sum(1 for value in percentages if value == 100)
    # With no data points there is nothing to average; report 0 rather than
    # failing with ZeroDivisionError.
    averagePercentLatin = sum(percentages) / totalNumber if totalNumber else 0

    print("Total Number of Articles Analysed: " + str(totalNumber))
    print("Average Percent Latin: " + str(averagePercentLatin))
    print("Number Pure Latin: " + str(numberPureLatin))
58
# Ask the user how many samples to draw and how large each one should be
print("How many samples would you like to do? ", end="")
numSamples = int(input())
print("How big should each sample be? (MAX 100) ", end="")
sampleSize = int(input())

# For every sample fetched, the list of per-title Latin percentages
sampleLatinData = [
    process_sample(get_sample(sampleSize)) for _ in range(numSamples)
]

# Summarise the percentages across all samples
analyse_processed_sample(sampleLatinData)
73
Note: See TracBrowser for help on using the repository browser.