guoyu/generate_test_data.py

223 lines
6.6 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
生成监狱测试数据
直接生成符合系统要求的Excel文件
"""
import pandas as pd
import random
from datetime import datetime, timedelta
# ===== Option values accepted by the target system =====

# Class (study group) choices
CLASS_OPTIONS = [
    '中级班1班',
    '初级班1班',
    '高级班1班',
    '攻坚转换班1班',
    '攻坚转换班2班',
]

# Education level choices
EDUCATION_OPTIONS = [
    '文盲',
    '小学',
    '初中',
    '高中',
    '中专',
    '大专',
    '本科',
    '研究生',
]
# Sex choices.
# Both entries were empty strings in this copy of the file, which made every
# generated record carry a blank sex value; restored to male / female.
SEX_OPTIONS = ['男', '女']
# Custody status choices
STATUS_OPTIONS = ['在押', '释放', '外出', '假释']

# Prison area choices
PRISON_AREA_OPTIONS = [
    '一监区',
    '二监区',
    '三监区',
    '四监区',
    '五监区',
    '六监区',
    '七监区',
    '八监区',
]

# Ethnicity choices
ETHNICITY_OPTIONS = [
    '汉族',
    '回族',
    '维吾尔族',
    '壮族',
    '满族',
    '彝族',
    '土家族',
    '藏族',
    '苗族',
    '蒙古族',
]

# Criminal charge choices
CRIME_OPTIONS = [
    '盗窃罪',
    '抢劫罪',
    '诈骗罪',
    '故意伤害罪',
    '寻衅滋事罪',
    '交通肇事罪',
    '贩卖毒品罪',
    '非法持有毒品罪',
    '走私罪',
    '受贿罪',
    '挪用公款罪',
    '贪污罪',
    '强奸罪',
    '故意杀人罪',
]
# Surname pool used by the name generator.
# NOTE(review): every single-character entry was an empty string in this copy
# of the file, so generated names came out blank. The values below are common
# surnames chosen as replacements (same list length) -- confirm against the
# upstream file if the exact pool matters.
SURNAMES = [
    '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周',
    '徐', '孙', '马', '朱', '胡', '郭', '何', '高', '林', '罗',
    '郑', '梁', '谢', '宋', '唐', '许', '韩', '冯', '邓', '曹',
    '彭', '曾', '肖', '田', '董', '袁', '潘', '于', '蒋', '蔡',
]

# Given-name pool used by the name generator.
# NOTE(review): only '秀英' survived in this copy; the other entries were empty
# strings and have been refilled with common given-name characters, keeping
# '秀英' at its original position -- confirm against the upstream file.
GIVEN_NAMES = [
    '伟', '芳', '娜', '秀英', '敏', '静', '丽', '强', '磊', '军',
    '洋', '勇', '艳', '杰', '娟', '涛', '明', '超', '霞', '平',
    '刚', '桂', '英', '华', '建', '文', '辉', '力', '鹏', '飞',
    '红', '梅', '兰', '玲', '斌', '波', '宇', '浩', '凯', '健',
]
def generate_name():
    """Return a random Chinese full name.

    The name is one entry from SURNAMES followed by either two entries
    (when ``random.random() < 0.7``) or one entry from GIVEN_NAMES.
    """
    family_name = random.choice(SURNAMES)
    # Draw the branch first, then the given-name parts, so the sequence of
    # random calls is: surname, branch, given part(s).
    part_count = 2 if random.random() < 0.7 else 1
    given_name = ''.join(random.choice(GIVEN_NAMES) for _ in range(part_count))
    return family_name + given_name
def generate_random_date(start_year=2020, end_year=2024):
    """Return a random datetime between Jan 1 of start_year and Dec 31 of end_year.

    Both boundary days are possible results. The time component is always
    midnight because only whole days are added to the start date.

    Args:
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).

    Returns:
        A ``datetime`` within the requested range.

    Raises:
        ValueError: If ``start_year`` is after ``end_year``.
    """
    # Fail with a clear message instead of randint's "empty range" error.
    if start_year > end_year:
        raise ValueError(
            f"start_year ({start_year}) must not be after end_year ({end_year})"
        )
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    # randint is inclusive on both ends, so last_day itself can be drawn.
    offset_days = random.randint(0, (last_day - first_day).days)
    return first_day + timedelta(days=offset_days)
def generate_sentence_dates():
    """Generate a sentence start date, end date and length in months.

    The start date is drawn from 2020-2024 via ``generate_random_date`` and the
    length is picked from a fixed list of month counts. The end date is the
    start date moved forward by that many calendar months (previously it was
    approximated as ``months * 30`` days, which drifted from the calendar: a
    12-month sentence ended after only 360 days).

    Returns:
        A tuple ``(start_date, end_date, sentence_months)`` where the dates are
        ``datetime`` objects and ``sentence_months`` is an int.
    """
    import calendar  # local import: only needed for month-length lookup

    # Sentence start (also used by the caller as the admission date).
    start_date = generate_random_date(2020, 2024)
    # Sentence length in months.
    sentence_months = random.choice([12, 18, 24, 36, 48, 60, 72, 84, 96, 120])

    # Calendar-month arithmetic: work with a zero-based month index so the
    # year rollover is a plain integer division.
    month_index = start_date.month - 1 + sentence_months
    end_year = start_date.year + month_index // 12
    end_month = month_index % 12 + 1
    # Clamp the day for shorter target months (e.g. Aug 31 + 18 months -> Feb 28/29).
    last_day_of_month = calendar.monthrange(end_year, end_month)[1]
    end_date = start_date.replace(
        year=end_year,
        month=end_month,
        day=min(start_date.day, last_day_of_month),
    )
    return start_date, end_date, sentence_months
def generate_test_data(count=100, start_id=201):
    """Generate random inmate test records as a DataFrame.

    Each record gets a sequential ID, a random name, random picks from the
    module-level option lists, and sentence dates from
    ``generate_sentence_dates``. Progress is printed every 10 records.

    Args:
        count: Number of records to generate. Zero or a negative value yields
            an empty DataFrame that still has all expected columns.
        start_id: ID assigned to the first record; later records increment by 1.

    Returns:
        A ``pandas.DataFrame`` with one row per record.
    """
    print(f"开始生成 {count} 条测试数据...")

    # Declared explicitly so an empty result still carries the headers that
    # downstream code (statistics, Excel export) indexes by name.
    columns = [
        '信息编号', '罪犯姓名', '监狱', '监区', '班级', '性别', '民族',
        '文化程度', '罪名', '刑期', '刑期起日', '刑期止日', '入监时间', '状态',
    ]

    data = []
    for i in range(count):
        # Sentence dates are drawn first, before any other random field.
        sentence_start, sentence_end, sentence_months = generate_sentence_dates()

        record = {
            '信息编号': start_id + i,
            '罪犯姓名': generate_name(),
            '监狱': '第一监狱',
            '监区': random.choice(PRISON_AREA_OPTIONS),
            '班级': random.choice(CLASS_OPTIONS),
            '性别': random.choice(SEX_OPTIONS),
            '民族': random.choice(ETHNICITY_OPTIONS),
            '文化程度': random.choice(EDUCATION_OPTIONS),
            '罪名': random.choice(CRIME_OPTIONS),
            '刑期': sentence_months,
            '刑期起日': sentence_start.strftime('%Y-%m-%d'),
            '刑期止日': sentence_end.strftime('%Y-%m-%d'),
            # Admission date is the same day the sentence starts.
            '入监时间': sentence_start.strftime('%Y-%m-%d'),
            '状态': random.choice(STATUS_OPTIONS),
        }
        data.append(record)

        # Report progress every 10 records.
        if (i + 1) % 10 == 0:
            print(f" 已生成 {i + 1}/{count}")

    return pd.DataFrame(data, columns=columns)
def print_statistics(df):
    """Print the record count and per-column value distributions of *df*.

    Distributions are printed for the class, education, sex, status and prison
    area columns, each in the order produced by ``value_counts()``.

    Args:
        df: DataFrame containing the columns '班级', '文化程度', '性别',
            '状态' and '监区'.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("📊 生成数据统计")
    print(rule)
    print(f"\n✅ 总记录数: {len(df)}")

    # (section heading, column name) pairs, in display order.
    sections = (
        ("班级分布", '班级'),
        ("文化程度分布", '文化程度'),
        ("性别分布", '性别'),
        ("状态分布", '状态'),
        ("监区分布", '监区'),
    )
    for heading, column in sections:
        print(f"\n✅ {heading}:")
        for value, occurrences in df[column].value_counts().items():
            print(f" {value}: {occurrences}")
def main(count=3200, start_id=201, output_file='test_data.xlsx'):
    """Generate the test data set, print a summary and save it as an Excel file.

    Calling ``main()`` with no arguments behaves as before: 3200 records
    starting at ID 201, written to ``test_data.xlsx``.

    Args:
        count: Number of records to generate.
        start_id: ID assigned to the first record.
        output_file: Path of the Excel file to write (written with the
            ``openpyxl`` engine).
    """
    rule = "=" * 60

    print(rule)
    print("🔧 监狱测试数据生成工具")
    print(rule)
    print(f"配置: 生成 {count} 条数据,起始编号 {start_id}")
    print(f"输出文件: {output_file}")
    print(rule + "\n")

    # Generate the records.
    df = generate_test_data(count, start_id)

    # Show distribution statistics.
    print_statistics(df)

    # Save to disk.
    print("\n" + rule)
    print(f"💾 保存到文件: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')
    print("✅ 生成完成!")
    print(f"📁 文件位置: {output_file}")
    print(f"📝 数据行数: {len(df)}")
    print(f"📋 列数: {len(df.columns)}")

    print("\n前5行数据预览:")
    print(df.head().to_string())

    print("\n" + rule)
    print("📖 使用说明")
    print(rule)
    # Report the actual output path rather than a hard-coded file name so the
    # instructions stay correct when output_file is overridden.
    print(f"1. ✅ 使用生成的文件: {output_file}")
    print("2. ✅ 在系统中导入该文件")
    print("3. ✅ 所有字段均为系统可接受的值")
    print("4. ✅ 数据完全随机生成,可重复运行")
    print("\n可用班级:")
    for cls in CLASS_OPTIONS:
        print(f" - {cls}")
    print(rule)


if __name__ == '__main__':
    main()