-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjson_word_count.py
More file actions
37 lines (23 loc) · 782 Bytes
/
json_word_count.py
File metadata and controls
37 lines (23 loc) · 782 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from bs4 import BeautifulSoup
import glob
import re
import json
def accumulate_ngrams(freq, lines):
    """Add the counts from n-gram lines into *freq* and return it.

    Each non-blank line must hold at least two whitespace-separated fields:
    the word followed by its integer count. Any further fields are ignored.

    Args:
        freq: Dict mapping word -> running total; updated in place.
        lines: Iterable of text lines (for example an open text file).

    Returns:
        The same *freq* dict that was passed in.

    Raises:
        ValueError: If a non-blank line has fewer than two fields, or its
            second field is not an integer.
    """
    for line in lines:
        # split() with no argument discards leading/trailing whitespace, so
        # the trailing newline never yields an empty field.
        fields = line.split()
        if not fields:
            continue  # blank line
        if len(fields) < 2:
            raise ValueError("malformed n-gram line: {!r}".format(line))
        word = fields[0]
        freq[word] = freq.get(word, 0) + int(fields[1])
    return freq


# Word frequencies grouped by publication year: {year: {word: count}}.
total_freq = {}

for xml in glob.iglob('data/metadata/*.xml'):
    # BeautifulSoup reads the whole file object when it is constructed, so
    # the handle can be released before the n-gram file is opened.
    with open(xml) as f:
        bs = BeautifulSoup(f, "lxml-xml")

    pub_year = bs.year  # first <year> element in the document, or None
    if pub_year is None:
        raise ValueError("no <year> element found in {}".format(xml))
    # Read the element's text rather than slicing its serialized markup, so
    # attributes on the tag or padding around the digits cannot shift the
    # slice. Only the first four characters of the text are used.
    year = int(pub_year.get_text(strip=True)[:4])
    year_freq = total_freq.setdefault(year, {})

    # Derive the matching n-gram path from the metadata path. Note that
    # str.replace substitutes every occurrence of each pattern in the path.
    txt = xml.replace("metadata", "ngram1").replace(".xml", "-ngram1.txt")
    with open(txt) as t:
        accumulate_ngrams(year_freq, t)

# json.dump serializes the integer year keys as strings.
with open("count.json", "w") as out:
    json.dump(total_freq, out)