source: other-projects/metadata-encoding/py/using-mongodb/graphSampleError.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 4 months ago

add comments adding context to functions that require it

  • Property svn:executable set to *
File size: 1.8 KB
Line 
1#!/usr/bin/env python3
2
3# Script to generate a time series graph of percentage of & errors in CrossRef titles
4# Does not generate a data file! Graphing is handled by matplotlib.
5
6import pymongo
7from matplotlib import pyplot as plt
8from collections import defaultdict
9import alive_progress
10
# Inclusive range of publication years shown on the graph.
year_range_min = 1980
year_range_max = 2023


def extract_issued_year(document):
    """Return the first component of the first "issued" entry, or None.

    None is returned when the document has no "issued" field, when the field
    is empty, or when its first entry is empty (which previously raised an
    IndexError and aborted the whole run).
    """
    issued = document.get("issued")
    if not issued:
        return None
    first_date = issued[0]
    if not first_date:
        return None
    return first_date[0]


def title_contains_amp(title):
    """Return True when the title contains the literal text "&amp".

    A plain string is searched directly. A list/tuple of strings is searched
    element by element; any other value (including None) yields False.
    """
    if isinstance(title, str):
        return "&amp" in title
    # NOTE(review): CrossRef's REST API delivers "title" as a list of strings;
    # this branch covers that shape in case the collection keeps it - confirm
    # against the stored schema.
    if isinstance(title, (list, tuple)):
        return any(isinstance(part, str) and "&amp" in part for part in title)
    return False


def count_titles_by_year(documents):
    """Tally documents per issued year and how many of them contain "&amp".

    Documents without a usable issued year are skipped. Keys are
    repr(issued_year), so an integer year 1999 is stored under "1999".

    Returns:
        A (year_count, amp_count) pair of defaultdict(int).
    """
    year_count = defaultdict(int)
    amp_count = defaultdict(int)
    for document in documents:
        issued_year = extract_issued_year(document)
        if issued_year is None:
            continue
        key = repr(issued_year)
        year_count[key] += 1
        if title_contains_amp(document.get("title")):
            amp_count[key] += 1
    return year_count, amp_count


def compute_percentages(year_count, amp_count):
    """Return {year: percentage of that year's titles containing "&amp"}."""
    return {
        year: (amp_count.get(year, 0) / total) * 100 if total > 0 else 0
        for year, total in year_count.items()
    }


def build_series(percentage_data, first_year, last_year):
    """Return parallel (years, percentages) lists for the inclusive range.

    Years with no entry in percentage_data are plotted as 0 instead of
    raising KeyError.
    """
    years = list(range(first_year, last_year + 1))
    percentages = [percentage_data.get(str(yr), 0) for yr in years]
    return years, percentages


def main():
    """Query the crossref collection, print the percentages and plot them."""
    # The context manager closes the connection once the cursor is consumed.
    with pymongo.MongoClient("mongodb://localhost") as client:
        collection = client["alpss"]["crossref"]
        # Only the two fields needed for the tally are fetched.
        cursor = collection.find({}, {"title": 1, "issued": 1})
        year_count, amp_count = count_titles_by_year(cursor)

    percentage_data = compute_percentages(year_count, amp_count)
    years, percentages = build_series(
        percentage_data, year_range_min, year_range_max
    )

    print(percentage_data)

    # Create a time-series graph using matplotlib
    plt.plot(years, percentages, marker='o')
    plt.xlabel('Year')
    plt.ylabel('Percentage of Titles with "&amp"')
    plt.title('Time Series of Percentage of Titles with "&amp" by Year')
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.