source: other-projects/metadata-encoding/py/using-api/blockBlockAnalysis.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 4 months ago

add comments adding context to functions that require it

File size: 2.7 KB
Line 
# Script that takes samples of articles from CrossRef and reports how much of each title is "Pure Latin" (characters with code points <= 255)
2
3from habanero import Crossref
4import get_unicode_blocks
5
def get_sample(sampleSize):
    """Request one random sample of works from CrossRef.

    sampleSize: number of works wanted; any value above 100 is reduced
        to 100 before the query is sent.

    Returns the result of the ``works`` sample query unchanged.
    """
    # Supplying a mailto address puts the request into the "polite" pool
    client = Crossref(mailto = "[email protected]")
    # A sample query is limited to 100 works, so larger requests are capped
    cappedSize = min(sampleSize, 100)
    return client.works(sample=cappedSize)
14
def process_sample(sampleQuery):
    """Return the percentage of "Latin" characters in each sampled title.

    sampleQuery: mapping whose items are read from
        sampleQuery["message"]["items"]; each item is a dict that may
        carry a "title" list.

    Returns a list with one entry per item that has a non-empty "title"
    list: the percentage (0-100) of characters in the first title whose
    code point is 255 or below. Items without a usable title are skipped
    and contribute no entry.
    """
    sampleDataToReturn = []
    for item in sampleQuery["message"]["items"]:
        # The "title" key may be absent, or present but empty; indexing [0]
        # on an empty list would raise IndexError, so both cases are skipped
        titles = item.get("title")
        if not titles:
            continue
        title = titles[0]
        # Characters with code points 0-255 are counted as "Latin"
        numLatinChars = sum(1 for char in title if ord(char) <= 255)
        if numLatinChars == 0:
            # Also covers an empty-string title, so no division by zero
            sampleDataToReturn.append(0)
        else:
            sampleDataToReturn.append(numLatinChars / len(title) * 100)
    return sampleDataToReturn
43
def analyse_processed_sample(processedSamples):
    """Print summary statistics for the processed samples.

    processedSamples: iterable of lists, one per sample, each holding the
        per-article percentage of Latin characters.

    Prints the total number of articles, the average percentage across all
    of them, and how many are exactly 100 percent Latin. When there are no
    articles at all the average is reported as 0. Returns None.
    """
    totalNumber = 0
    numberPureLatin = 0
    percentSum = 0

    for processedSample in processedSamples:
        for articleData in processedSample:
            totalNumber += 1
            percentSum += articleData
            if articleData == 100:
                numberPureLatin += 1

    # With zero articles (no samples, or no item had a title) the division
    # would raise ZeroDivisionError, so fall back to an average of 0
    averagePercentLatin = percentSum / totalNumber if totalNumber else 0

    print("Total Number of Articles Analysed: " + str(totalNumber))
    print("Average Percent Latin: " + str(averagePercentLatin))
    print("Number Pure Latin: " + str(numberPureLatin))
60
# Ask how many samples to draw and how many articles each should contain
print("How many samples would you like to do? ", end="")
numSamples = int(input())
print("How big should each sample be? (MAX 100) ", end="")
sampleSize = int(input())

# One inner list per sample, holding the percentage of each title that is
# comprised of Latin; each sample is fetched and then processed in turn
sampleLatinData = [
    process_sample(get_sample(sampleSize))
    for _ in range(numSamples)
]

analyse_processed_sample(sampleLatinData)
75
Note: See TracBrowser for help on using the repository browser.