-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathxml2.py
More file actions
47 lines (37 loc) · 1.64 KB
/
xml2.py
File metadata and controls
47 lines (37 loc) · 1.64 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#import requests
#user_agent_url = 'C:\\Users\\hejia\\Documents\\python\\linkedin\\xml_data.xml'
#xml_data = requests.get(user_agent_url).content
import xml.etree.ElementTree as ET
import pandas as pd
class XML2DataFrame:
    """Flatten the children of an XML document's root into DataFrame rows.

    Each direct child of the root becomes one row. The attributes and text
    of that child and of all its descendants are merged into one flat
    dictionary, whose keys become the row's columns.
    """

    def __init__(self, xml_data):
        """Parse *xml_data* and keep its root element in ``self.root``.

        Args:
            xml_data: A filename or file object containing XML, as accepted
                by ``xml.etree.ElementTree.parse``.
        """
        # NOTE: ET.parse reads from a path/file object; for XML already held
        # in a string, ET.XML(xml_data) is the counterpart.
        self.root = ET.parse(xml_data).getroot()

    def parse_root(self, root):
        """Return a list of dictionaries, one per direct child of *root*.

        Args:
            root: The element whose children are flattened.

        Returns:
            A list with the ``parse_element`` result for each child, in
            document order.
        """
        # Iterate the element itself: Element.getchildren() was removed in
        # Python 3.9.
        return [self.parse_element(child) for child in root]

    def parse_element(self, element, parsed=None):
        """Collect attributes and text of *element* and all its descendants.

        Args:
            element: The XML element to flatten.
            parsed: Dictionary to merge into. A new one is created when
                omitted; recursive calls pass the same dictionary down so
                the whole subtree ends up in a single mapping.

        Returns:
            The dictionary holding ``{attribute name: value}`` and
            ``{tag: text}`` entries as strings.
        """
        if parsed is None:
            parsed = {}

        for key, value in element.attrib.items():
            if key not in parsed:
                parsed[key] = value
            else:
                # Best effort: the first value seen for an attribute name is
                # kept and the clash is only reported, not raised.
                print(
                    'there is an error: duplicate attribute {0!r} '
                    'at element <{1}>'.format(key, element.tag)
                )

        # Record the text once per element, whether or not the element has
        # attributes. Any non-empty text counts (including whitespace-only
        # text), and a later element with the same tag overwrites the entry.
        if element.text:
            parsed[element.tag] = element.text

        # Recurse into the children, accumulating into the same dictionary.
        for child in element:
            self.parse_element(child, parsed)
        return parsed

    def process_data(self):
        """Parse the stored root and return the rows as a DataFrame.

        Returns:
            A ``pandas.DataFrame`` built from the list produced by
            ``parse_root(self.root)``.
        """
        structure_data = self.parse_root(self.root)
        return pd.DataFrame(structure_data)
# Module-level run: load 'xml_data.xml' (resolved against the current working
# directory) and flatten it into a DataFrame.
xml2df = XML2DataFrame(xml_data='xml_data.xml')
xml_dataframe = xml2df.process_data()