Skip to content

提取 HTTP 响应包中的 JSON 并输出为 CSV

python
from glob import glob
import json
import csv
import traceback

def read_file(name):
    """Return the entire content of the file at ``name`` decoded as UTF-8."""
    with open(name, mode='r', encoding='utf-8') as handle:
        text = handle.read()
    return text

def get_json_from_html(html):
    """Parse JSON from ``html``, which is either bare JSON or a raw HTTP message.

    When the text before the first blank line contains an ``HTTP/`` version
    marker, that text is treated as the status line and headers, and
    everything after the blank line is parsed as the JSON body. Otherwise the
    whole input is parsed as JSON.

    Args:
        html: Text of a JSON document or of an HTTP message with a JSON body.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the selected text is not valid JSON.
    """
    # Normalise CRLF so messages read without newline translation are split
    # at the same place as ones whose line endings were already converted.
    text = html.replace('\r\n', '\n')
    # partition() cuts at the FIRST blank line only, so blank lines inside a
    # pretty-printed JSON body no longer truncate it.
    head, blank, body = text.partition('\n\n')
    if blank and 'HTTP/' in head:
        return json.loads(body)
    return json.loads(html)

def gen_html():
    """Yield the ``['data']['content']`` list of every path matching ``data/*/*``.

    Paths are visited in sorted order so repeated runs group the same inputs
    together. Exactly one value is yielded per matched path: the parsed list
    on success, or an empty list when the path cannot be read or parsed, or
    when the value under ``['data']['content']`` is not a list.

    Yields:
        list: The records extracted from one input file (possibly empty).
    """
    for item in sorted(glob("data/*/*")):
        try:
            data = get_json_from_html(read_file(item))['data']['content']
        except Exception as exc:
            # Best effort: one bad input must not stop the run. Catching
            # Exception rather than everything lets Ctrl-C still interrupt.
            print('skip', item, repr(exc))
            data = []
        if not isinstance(data, list):
            # The batching loop extends a list with this value, so anything
            # other than a list is reported and replaced by an empty one.
            print('skip', item, 'content is not a list')
            data = []
        yield data

def save_contents(name, contents):
    """Write an iterable of dict records to ``name`` as a fully quoted CSV file.

    The header row is the union of all record keys in first-seen order, and
    every value is written under its own key's column. A record that lacks a
    key gets an empty cell there, so records with differing key sets or key
    order stay aligned with the header. When ``contents`` is empty the file
    is created empty, with no header row.

    Args:
        name: Path of the CSV file to create or overwrite.
        contents: Iterable of dicts, one per output row.
    """
    rows = list(contents)
    # dict.fromkeys de-duplicates while keeping the first-seen key order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    with open(name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=fieldnames,
            restval='',
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_ALL,
        )
        if rows:
            writer.writeheader()
            writer.writerows(rows)
    print('write', name)

# Number of input files merged into each output CSV.
BATCH_SIZE = 100

page = 0       # index of the first input file in the current batch
count = 0      # input files accumulated in the current batch
contents = []  # records gathered from those files
for jdata in gen_html():
    try:
        contents.extend(jdata)
        count += 1
        # ">=" rather than "==": if a previous save raised, count has already
        # moved past BATCH_SIZE and an equality test would never fire again.
        if count >= BATCH_SIZE:
            save_contents(f'file_{page}_{page+count}.csv', contents)
            page = page + count
            contents = []
            count = 0
    except Exception:
        # Report the failure and keep going with the next input.
        traceback.print_exc()

# Flush whatever is left over. Testing "count" (not "count < BATCH_SIZE")
# avoids writing an empty file when the inputs divide evenly into batches,
# and still writes the data when an earlier failed save left count above
# BATCH_SIZE.
if count:
    save_contents(f'file_{page}_{page+count}.csv', contents)

Released under the MIT License.