#!/usr/bin/env python3

from collections import Counter
from collections import defaultdict

import alive_progress
import pymongo
from matplotlib import pyplot as plt

# Connection settings for the MongoDB collection that is scanned.
MONGO_URI = "mongodb://localhost"
DATABASE_NAME = "alpss"
COLLECTION_NAME = "crossref"

# Inclusive range of years shown on the plot.
year_range_min = 1980
year_range_max = 2023


def _issued_year(document):
    """Return ``document["issued"][0][0]``, or None when it is unavailable.

    None is returned when the field is missing, empty, or not shaped as a
    nested sequence, so one malformed record cannot abort a full scan.
    """
    issued = document.get("issued")
    if not issued:
        return None
    try:
        return issued[0][0]
    except (IndexError, KeyError, TypeError):
        # e.g. [[]] or a non-subscriptable value; treat as "no year".
        return None


def _title_contains_amp(document):
    """Return True when the document's title contains an ampersand."""
    title = document.get("title", "")
    if isinstance(title, str):
        return "&" in title
    if isinstance(title, (list, tuple)):
        # NOTE(review): Crossref's public schema stores "title" as a list of
        # strings; a plain `"&" in title` would then only match an element
        # equal to "&". Confirm which shape this collection actually uses.
        return any("&" in part for part in title if isinstance(part, str))
    # None or any other unexpected type: nothing to search.
    return False


def count_titles_by_year(documents):
    """Count documents per issued year, and those whose title contains "&".

    Args:
        documents: iterable of mappings with optional "issued" and "title"
            entries (for example a PyMongo cursor).

    Returns:
        A ``(year_count, amp_count)`` pair of Counters keyed by ``str(year)``.
        Documents without a usable year are ignored.
    """
    year_count = Counter()
    amp_count = Counter()
    for document in documents:
        year = _issued_year(document)
        if year is None:
            continue
        # str() (not repr()) so the keys match the str(year) lookups made
        # when the plotted series is built, whatever type the year has.
        key = str(year)
        year_count[key] += 1
        if _title_contains_amp(document):
            amp_count[key] += 1
    return year_count, amp_count


def compute_percentages(year_count, amp_count):
    """Return ``{year: percentage of that year's titles containing "&"}``."""
    return {
        year: (amp_count.get(year, 0) / total) * 100 if total > 0 else 0
        for year, total in year_count.items()
    }


def build_series(percentage_data, first_year, last_year):
    """Return ``(years, percentages)`` for the inclusive year range.

    Years with no entry in ``percentage_data`` are reported as 0 instead of
    raising KeyError.
    """
    years = list(range(first_year, last_year + 1))
    percentages = [percentage_data.get(str(year), 0) for year in years]
    return years, percentages


def main():
    """Scan the collection, print the per-year percentages, and plot them."""
    client = pymongo.MongoClient(MONGO_URI)
    try:
        collection = client[DATABASE_NAME][COLLECTION_NAME]
        # Project only the two fields the analysis reads.
        cursor = collection.find({}, {"title": 1, "issued": 1})
        year_count, amp_count = count_titles_by_year(cursor)
    finally:
        client.close()

    percentage_data = compute_percentages(year_count, amp_count)
    years, percentages = build_series(
        percentage_data, year_range_min, year_range_max
    )

    print(percentage_data)

    # Time-series graph of the percentage per year.
    plt.plot(years, percentages, marker='o')
    plt.xlabel('Year')
    plt.ylabel('Percentage of Titles with "&"')
    plt.title('Time Series of Percentage of Titles with "&" by Year')
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()