guoyu/log/测试数据/generate_test_data.py
2025-12-13 13:36:18 +08:00

223 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
生成学员测试数据
直接生成符合系统要求的Excel文件
"""
import pandas as pd
import random
from datetime import datetime, timedelta
# ===== Option values actually used by the system =====

# Class choices
CLASS_OPTIONS = [
    '中级班1班',
    '初级班1班',
    '高级班1班',
    '攻坚转换班1班',
    '攻坚转换班2班',
]

# Education-level choices
EDUCATION_OPTIONS = [
    '文盲',
    '小学',
    '初中',
    '高中',
    '中专',
    '大专',
    '本科',
    '研究生',
]
# Sex choices.
# NOTE(review): both entries were empty strings in the reviewed copy, which
# made every generated record carry a blank '性别' value. They are restored
# here to the usual male/female labels -- confirm these match the values the
# importing system accepts.
SEX_OPTIONS = ['男', '女']
# Status choices
STATUS_OPTIONS = [
    '在押',
    '释放',
    '外出',
    '假释',
]

# Prison-area choices
PRISON_AREA_OPTIONS = [
    '一监区', '二监区', '三监区', '四监区',
    '五监区', '六监区', '七监区', '八监区',
]

# Ethnicity choices
ETHNICITY_OPTIONS = [
    '汉族', '回族', '维吾尔族', '壮族', '满族',
    '彝族', '土家族', '藏族', '苗族', '蒙古族',
]

# Criminal-charge choices
CRIME_OPTIONS = [
    '盗窃罪', '抢劫罪', '诈骗罪', '故意伤害罪', '寻衅滋事罪',
    '交通肇事罪', '贩卖毒品罪', '非法持有毒品罪', '走私罪', '受贿罪',
    '挪用公款罪', '贪污罪', '强奸罪', '故意杀人罪',
]
# NOTE(review): in the reviewed copy every single-character entry of the two
# lists below was an empty string (only '秀英' survived), so generated names
# came out empty or degenerate. The blanks are repopulated with common
# Chinese surnames / given names, keeping both lists at 40 entries and '秀英'
# at its original position -- confirm against the canonical file if one exists.

# Surname pool
SURNAMES = [
    '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周',
    '徐', '孙', '马', '朱', '胡', '郭', '何', '高', '林', '罗',
    '郑', '梁', '谢', '宋', '唐', '许', '韩', '冯', '邓', '曹',
    '彭', '曾', '肖', '田', '董', '袁', '潘', '于', '蒋', '蔡',
]

# Given-name pool (an entry may be more than one character, e.g. '秀英')
GIVEN_NAMES = [
    '伟', '芳', '娜', '秀英', '敏', '静', '丽', '强', '磊', '军',
    '洋', '勇', '艳', '杰', '娟', '涛', '明', '超', '霞', '平',
    '刚', '红', '辉', '鹏', '华', '飞', '鑫', '波', '斌', '宇',
    '浩', '凯', '健', '俊', '帆', '帅', '旭', '宁', '龙', '林',
]
def generate_name():
    """Return a random Chinese name.

    The name is one entry from SURNAMES followed by either two entries
    (70% of the time) or one entry (30% of the time) from GIVEN_NAMES.
    """
    family = random.choice(SURNAMES)
    # Decide the given-name length first, then draw that many entries.
    picks = 2 if random.random() < 0.7 else 1
    given = ''.join(random.choice(GIVEN_NAMES) for _ in range(picks))
    return family + given
def generate_random_date(start_year=2020, end_year=2024):
    """Return a random midnight datetime within the given year range.

    Args:
        start_year: First year of the range; January 1 is the earliest result.
        end_year: Last year of the range; December 31 is the latest result.

    Returns:
        A datetime chosen uniformly by day between the two bounds, inclusive.
    """
    first_day = datetime(start_year, 1, 1)
    span_days = (datetime(end_year, 12, 31) - first_day).days
    # randint is inclusive on both ends, so the last day can be drawn too.
    return first_day + timedelta(days=random.randint(0, span_days))
def _add_months(moment, months):
    """Return *moment* moved forward by ``months`` whole calendar months.

    The day of month is kept when it exists in the target month; otherwise
    it is clamped to that month's last day (e.g. Aug 31 + 6 -> Feb 28/29).
    """
    import calendar  # local import: not among the file-level imports

    zero_based_month = moment.month - 1 + months
    year = moment.year + zero_based_month // 12
    month = zero_based_month % 12 + 1
    last_day = calendar.monthrange(year, month)[1]
    return moment.replace(year=year, month=month, day=min(moment.day, last_day))


def generate_sentence_dates():
    """Generate the dates describing one sentence.

    Returns:
        A ``(start_date, end_date, sentence_months)`` tuple: the sentence
        start (also used by callers as the admission date), the sentence end,
        and the sentence length in months.
    """
    # Sentence start date (admission date)
    start_date = generate_random_date(2020, 2024)
    # Sentence length in months
    sentence_months = random.choice([12, 18, 24, 36, 48, 60, 72, 84, 96, 120])
    # Sentence end date: use real calendar months rather than 30-day blocks,
    # which would end a 120-month term roughly 52 days too early.
    end_date = _add_months(start_date, sentence_months)
    return start_date, end_date, sentence_months
def generate_test_data(count=100, start_id=201):
    """Generate random test records.

    Args:
        count: Number of records to generate. Zero or a negative value
            yields an empty frame that still carries every column.
        start_id: Record number assigned to the first row; later rows count
            up from it by one.

    Returns:
        DataFrame with one row per record.
    """
    print(f"开始生成 {count} 条测试数据...")

    # The schema is passed to DataFrame explicitly so that an empty result
    # keeps its columns; otherwise column lookups such as df['班级'] would
    # raise KeyError when count is 0.
    columns = [
        '信息编号', '罪犯姓名', '监区', '班级', '性别', '民族', '文化程度',
        '罪名', '刑期', '刑期起日', '刑期止日', '入监时间', '状态',
    ]

    rows = []
    for i in range(count):
        # Sentence dates are drawn before the other random fields.
        sentence_start, sentence_end, sentence_months = generate_sentence_dates()
        start_text = sentence_start.strftime('%Y-%m-%d')

        # Values are listed in the same order as `columns`.
        rows.append((
            start_id + i,
            generate_name(),
            random.choice(PRISON_AREA_OPTIONS),
            random.choice(CLASS_OPTIONS),
            random.choice(SEX_OPTIONS),
            random.choice(ETHNICITY_OPTIONS),
            random.choice(EDUCATION_OPTIONS),
            random.choice(CRIME_OPTIONS),
            sentence_months,
            start_text,
            sentence_end.strftime('%Y-%m-%d'),
            start_text,  # admission date equals the sentence start date
            random.choice(STATUS_OPTIONS),
        ))

        # Report progress after every 10 records
        if (i + 1) % 10 == 0:
            print(f" 已生成 {i + 1}/{count}")

    return pd.DataFrame(rows, columns=columns)
def print_statistics(df):
    """Print the record count and per-column value distributions.

    Args:
        df: DataFrame containing the '班级', '文化程度', '性别', '状态' and
            '监区' columns.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("📊 生成数据统计")
    print(rule)
    print(f"\n✅ 总记录数: {len(df)}")

    # (heading, column) pairs, reported in this order.
    sections = (
        ("\n✅ 班级分布:", '班级'),
        ("\n✅ 文化程度分布:", '文化程度'),
        ("\n✅ 性别分布:", '性别'),
        ("\n✅ 状态分布:", '状态'),
        ("\n✅ 监区分布:", '监区'),
    )
    for heading, column in sections:
        print(heading)
        for value, occurrences in df[column].value_counts().items():
            print(f" {value}: {occurrences}")
def main():
    """Generate the test data, print statistics and save it as an Excel file."""
    rule = "=" * 60

    print(rule)
    print("🔧 学员测试数据生成工具")
    print(rule)

    # Run configuration
    count = 3200                    # number of records to generate
    start_id = 200                  # record number of the first row
    output_file = 'test_data.xlsx'

    print(f"配置: 生成 {count} 条数据,起始编号 {start_id}")
    print(f"输出文件: {output_file}")
    print(rule + "\n")

    # Build the records and summarise them
    df = generate_test_data(count, start_id)
    print_statistics(df)

    # Write the workbook
    print("\n" + rule)
    print(f"💾 保存到文件: {output_file}")
    df.to_excel(output_file, index=False, engine='openpyxl')

    for line in (
        "✅ 生成完成!",
        f"📁 文件位置: {output_file}",
        f"📝 数据行数: {len(df)}",
        f"📋 列数: {len(df.columns)}",
        "\n前5行数据预览:",
    ):
        print(line)
    print(df.head().to_string())

    # Usage notes
    print("\n" + rule)
    print("📖 使用说明")
    print(rule)
    usage_lines = (
        "1. ✅ 使用生成的文件: test_data.xlsx",
        "2. ✅ 在系统中导入该文件",
        "3. ✅ 所有字段均为系统可接受的值",
        "4. ✅ 数据完全随机生成,可重复运行",
    )
    for line in usage_lines:
        print(line)

    print("\n可用班级:")
    for cls in CLASS_OPTIONS:
        print(f" - {cls}")
    print(rule)
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()