1 | #!/usr/bin/env python3
|
---|
2 |
|
---|
3 | # Script to generate a time series graph of percentage of & errors in CrossRef titles
|
---|
4 | # Does not generate a data file! Graphing is handled by matplotlib.
|
---|
5 |
|
---|
6 | import pymongo
|
---|
7 | from matplotlib import pyplot as plt
|
---|
8 | from collections import defaultdict
|
---|
9 | import alive_progress
|
---|
10 |
|
---|
# Inclusive range of publication years shown on the graph.
year_range_min = 1980
year_range_max = 2023


def issued_year_of(document):
    """Return the year stored in a document's ``issued`` field, or None.

    The year is read as ``document["issued"][0][0]``.
    NOTE(review): this assumes ``issued`` is stored as a list of date-part
    lists (e.g. ``[[2020, 1, 15]]``) -- confirm against the collection.

    Returns None when the field is missing, empty, or its first entry is
    empty, so such documents are skipped instead of raising IndexError.
    """
    issued = document.get("issued")
    if not issued or not issued[0]:
        return None
    return issued[0][0]


def tally_titles_by_year(documents):
    """Count documents, and documents whose title contains "&", per year.

    Args:
        documents: iterable of dict-like records with optional ``title``
            and ``issued`` fields.

    Returns:
        A ``(year_count, amp_count)`` pair of defaultdicts keyed by
        ``str(year)``. Documents without a usable issued year are ignored.
    """
    year_count = defaultdict(int)
    amp_count = defaultdict(int)

    for document in documents:
        issued_year = issued_year_of(document)
        if issued_year is None:
            continue

        # str() is used here and when the plot range is looked up, so the
        # keys written and the keys read always agree.
        key = str(issued_year)
        year_count[key] += 1

        # A title that is present but null is treated like a missing one.
        # NOTE(review): if titles are stored as a list of strings, this is a
        # list-membership test (an element equal to "&"), not a substring
        # test -- confirm how the collection stores titles.
        title = document.get("title") or ""
        if "&" in title:
            amp_count[key] += 1

    return year_count, amp_count


def percentage_by_year(year_count, amp_count):
    """Return ``{year: percent of that year's titles containing "&"}``.

    A year whose total is zero maps to 0 rather than dividing by zero.
    """
    return {
        year: (amp_count.get(year, 0) / total) * 100 if total > 0 else 0
        for year, total in year_count.items()
    }


def series_for_range(percentage_data, first_year, last_year):
    """Build parallel ``years`` / ``percentages`` lists for plotting.

    Covers every year from ``first_year`` to ``last_year`` inclusive. A year
    with no entry in ``percentage_data`` is reported as 0 instead of raising
    KeyError.
    """
    years = list(range(first_year, last_year + 1))
    percentages = [percentage_data.get(str(yr), 0) for yr in years]
    return years, percentages


def main():
    """Query MongoDB, print the per-year percentages, and show the graph."""
    # Connect to MongoDB
    client = pymongo.MongoClient("mongodb://localhost")
    db = client["alpss"]
    collection = db["crossref"]

    # Query every document, requesting only the title and issued fields
    cursor = collection.find({}, {"title": 1, "issued": 1})

    # Calculate the percentage of titles containing "&" by year
    year_count, amp_count = tally_titles_by_year(cursor)
    percentage_data = percentage_by_year(year_count, amp_count)

    years, percentages = series_for_range(
        percentage_data, year_range_min, year_range_max
    )

    print(percentage_data)

    # Create a time-series graph using matplotlib
    plt.plot(years, percentages, marker='o')
    plt.xlabel('Year')
    plt.ylabel('Percentage of Titles with "&"')
    plt.title('Time Series of Percentage of Titles with "&" by Year')
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()