Context Navigation

graphSampleError.py@ 38791

Last change on this file since 38791 was 38791, checked in by jc550, 4 months ago
add comments adding context to functions that require it
Property svn:executable set to `*`
File size: 1.8 KB

Line
1	#!/usr/bin/env python3
2
3	# Script to generate a time series graph of the percentage of CrossRef titles containing "&amp" errors
4	# Does not generate a data file! Graphing is handled by matplotlib.
5
6	import pymongo
7	from matplotlib import pyplot as plt
8	from collections import defaultdict
9	import alive_progress
10
# Connection settings and the range of publication years shown on the graph.
MONGO_URI = "mongodb://localhost"
DATABASE_NAME = "alpss"
COLLECTION_NAME = "crossref"
YEAR_RANGE_MIN = 1980
YEAR_RANGE_MAX = 2023

# Marker left behind in a title when an ampersand was HTML-escaped.
AMP_MARKER = "&amp"


def extract_issued_year(document):
    """Return the first component of the first "issued" entry, or None.

    None is returned when the document has no "issued" field, when the field
    is empty, or when its first entry is empty (which previously raised
    IndexError).
    """
    issued = document.get("issued")
    if not issued or not issued[0]:
        return None
    return issued[0][0]


def title_has_amp(document):
    """Return True if the document's title contains the "&amp" marker.

    A missing or null title counts as not containing the marker.  A string
    title is searched directly.  If the title is a list/tuple, each string
    element is searched, so a list-valued title is not reduced to an
    exact-match test against "&amp".
    """
    title = document.get("title") or ""
    if isinstance(title, str):
        return AMP_MARKER in title
    if isinstance(title, (list, tuple)):
        return any(isinstance(part, str) and AMP_MARKER in part for part in title)
    # Unknown container type: fall back to the plain membership test.
    return AMP_MARKER in title


def count_titles_by_year(documents):
    """Tally documents per issued year.

    Args:
        documents: iterable of dict-like records with optional "title" and
            "issued" fields.

    Returns:
        A ``(year_count, amp_count)`` pair of dicts keyed by ``str(year)``:
        the number of dated documents per year, and how many of those have a
        title containing "&amp".  Documents without a usable year are skipped.
    """
    year_count = defaultdict(int)
    amp_count = defaultdict(int)
    for document in documents:
        issued_year = extract_issued_year(document)
        if issued_year is None:
            continue
        # str() (not repr()) so the key matches the str(year) lookup used when
        # building the plot series, whether the year is stored as int or str.
        key = str(issued_year)
        year_count[key] += 1
        if title_has_amp(document):
            amp_count[key] += 1
    return year_count, amp_count


def compute_percentages(year_count, amp_count):
    """Return {year: percentage of that year's titles containing "&amp"}."""
    return {
        year: (amp_count.get(year, 0) / total) * 100 if total > 0 else 0
        for year, total in year_count.items()
    }


def build_series(percentage_data, first_year, last_year):
    """Return parallel ``(years, percentages)`` lists for an inclusive range.

    Years with no entry in ``percentage_data`` are reported as 0 rather than
    raising KeyError.
    """
    years = list(range(first_year, last_year + 1))
    percentages = [percentage_data.get(str(year), 0) for year in years]
    return years, percentages


def main():
    """Query MongoDB, print the per-year percentages, and show the graph."""
    # The context manager closes the connection once the cursor is consumed.
    with pymongo.MongoClient(MONGO_URI) as client:
        collection = client[DATABASE_NAME][COLLECTION_NAME]
        # Project only the two fields needed to keep the transfer small.
        cursor = collection.find({}, {"title": 1, "issued": 1})
        year_count, amp_count = count_titles_by_year(cursor)

    percentage_data = compute_percentages(year_count, amp_count)
    years, percentages = build_series(
        percentage_data, YEAR_RANGE_MIN, YEAR_RANGE_MAX
    )

    print(percentage_data)

    # Create a time-series graph using matplotlib
    plt.plot(years, percentages, marker='o')
    plt.xlabel('Year')
    plt.ylabel('Percentage of Titles with "&amp"')
    plt.title('Time Series of Percentage of Titles with "&amp" by Year')
    plt.grid(True)
    plt.show()


if __name__ == "__main__":
    main()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: other-projects/metadata-encoding/py/using-mongodb/graphSampleError.py@ 38791

Download in other formats: