From d222cfe96807a8e9c61e17497d77f5f40a125bd7 Mon Sep 17 00:00:00 2001 From: haotian <2421912570@qq.com> Date: Fri, 21 Feb 2025 10:00:42 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E6=88=90--=E5=AE=8C=E6=88=90=E7=B3=BB?= =?UTF-8?q?=E7=BB=9F=E7=9B=91=E6=8E=A7=E8=8E=B7=E5=8F=96=E7=B3=BB=E7=BB=9F?= =?UTF-8?q?=E6=97=A5=E5=BF=97=E6=96=B9=E6=B3=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/接口文档code.md | 50 +++++- doc/时间计划.md | 2 +- example_system_moniter_get_log.py | 5 + .../__pycache__/system_monitor.cpython-39.pyc | Bin 8463 -> 11370 bytes function/system_monitor.py | 151 ++++++++++++++++++ 5 files changed, 203 insertions(+), 5 deletions(-) create mode 100644 example_system_moniter_get_log.py diff --git a/doc/接口文档code.md b/doc/接口文档code.md index 39a40d5..e874777 100644 --- a/doc/接口文档code.md +++ b/doc/接口文档code.md @@ -706,19 +706,61 @@ Response: ### 3.4 获取系统日志 ```http -GET /api/system/logs?level=error&start_time=2023-08-20T00:00:00 +GET /api/system/logs?level=error&start_time=2025-02-19T00:00:00&end_time=2025-02-19T23:59:59&module=training&page=1&page_size=20 + +Parameters: +- level: 日志级别过滤 (可选: debug, info, warning, error, critical) +- start_time: 开始时间 (可选, 格式: YYYY-MM-DDThh:mm:ss) +- end_time: 结束时间 (可选, 格式: YYYY-MM-DDThh:mm:ss) +- module: 模块名称过滤 (可选: training, data_processing, model, system) +- page: 页码 (默认: 1) +- page_size: 每页数量 (默认: 20) Response: { "status": "success", "logs": [ { - "timestamp": "2023-08-20T10:15:00", + "timestamp": "2025-02-19T10:15:00", "level": "ERROR", "module": "training", - "message": "Out of memory error in GPU 0" + "message": "Out of memory error in GPU 0", + "details": { + "error_type": "RuntimeError", + "gpu_id": 0, + "memory_used": "15.6GB", + "memory_total": "16GB" + }, + "context": { + "experiment_id": "656341556838275234", + "run_id": "7970364d490f4e0aa0375c2db26215f3", + "model": "XGBClassifier" + } } - ] + ], + "pagination": { + "current_page": 1, + "page_size": 20, + "total_pages": 3, + "total_items": 42 + }, + "summary": { + "error_count": 5, + "warning_count": 12, + "info_count": 25, + "most_frequent_error": "Out of memory error", + "most_affected_module": "training" + } +} + +Error Response: +{ + "status": "error", + "message": "获取系统日志失败", + "details": { + "error_type": "FileNotFoundError", + "error_message": "Log file not found" + } } ``` diff --git a/doc/时间计划.md b/doc/时间计划.md index 512e520..8de5b64 100644 --- a/doc/时间计划.md +++ b/doc/时间计划.md @@ -19,7 +19,7 @@ ## 4. 监控系统 (1) - 20250226 - - [ ] 资源监控 + - [x] 资源监控 - [ ] 训练监控 - [ ] 告警系统 - [ ] 日志聚合 diff --git a/example_system_moniter_get_log.py b/example_system_moniter_get_log.py new file mode 100644 index 0000000..d06786e --- /dev/null +++ b/example_system_moniter_get_log.py @@ -0,0 +1,5 @@ +from function.system_monitor import SystemMonitor + +systemMoniter = SystemMonitor() + +print(systemMoniter.get_system_logs()) \ No newline at end of file diff --git a/function/__pycache__/system_monitor.cpython-39.pyc b/function/__pycache__/system_monitor.cpython-39.pyc index 12f1e797364af2ebf31e24311fa4573d230271db..bdbfe6092a7089dc99faa762f6038e51dac3fa11 100644 GIT binary patch delta 4561 zcmb7IYj7OJ5#HI`+k2l*(nn!{_KZFbp5ZlNEzc7gfk!=i%T*u{RC7*RK&+hpl z?{OhEU>>Q-90@$GrjlhoqxSQoDGHi5`6yfr}sv+yPpepl9!%aZ}d}%l0viA3{SPt zQf$>)i6nSYExJdd9_l?SQSTX9Yolbpl=eOM<2zTUiS5}@u9i%lLE5-kgw@!6v;)_t4Hs5ACAeXWd#Wje(UF{FmekY2vTS`$#MQyZlq~ zDSyLtlK6SAvY7<=A*I(H1|v3qScyfOfMNn7NQD3rm)=lbCZwD1RY%no$nE9ts$1(T zfwVnGvPRZ48SBB06taA~PUiHAnJ<@E5Zi8@u`;t=hRM<@LsObETB#bE7ci4Ajg(mz zFxe^;Cc0yN!38^%8P(0qyyzMbGHwEpE>%2d3AvGf?7gl1254)(5vCuj>ZRN%hOV)0 zzQb1xpe(x?VKcw%TfJ``lIRy(kFWtCtq8B!Z9r?@95`j_#R}Vq{ny50&KVRzzYp?0 z|0jV2$|Vsx_@=;(V%HrA%h`505pfmu;&K+*Rv1{~w}-m;p98-Q44|6r z0M!(q2zHE3zw*xPPhQ=U8ch4yt-#g-#))ht1Io(m6#M^tV7H+1WoFH`L5Fqe_E0ZT z9S67`UJdhnD!egry$$~BaCdCEiMm;se;!`3av1}5SgbM48)G7_PhQ&5bdZn)e<9Kn z*o9v8BkblskE{o?e~WA#`VtE$Ch|zBuyT&=!S-?nn}tD?(G5e`eCd+a{B6>VnW(v7 zfi(r)VXJtkbr;#g_qT2*>p5?|qmDXlPZ7ptM|Imnbu*hU7=!6B>qBd|BkV-@I)ccw zy-3LjZ2-0xcEX66FX~Q0UHKmUF7=9qA=Nk0jSz(UE)l(Jwbeks}Zrb0IHUnMrcJpPebp^DfH@s8wXI2`Ux zraG=mb^o?P-Y|3JVr3x6zngq4BGw>-4n2r4%vW^m+Pe>ZBYlh@3st6rYy@c`co-f6L17Bz}1UX!&MNGdHp&=Tk zO;5Wf$g{g^%~rGM>#Ma`O^txn(g;pTQ*}~nwOWn$k2S2;MhHwqh1n>DQw|iiSw42f z0vwar6_9DO{9_861u9D6Ms9>*M7u>M2=iO;#P%4NV}2pgl8|aKOM&)E3!b89<*~+i zx7pN)&{iwjXs**JZJUx~5^ay6!T2Gm6d@9R^HMGTfK-zEB`avPyQPFgW4B5!X`5uV z)A*PSYM-(~;79@-X`x97bVowEMlNi{u4%V5T50DL0as(rIIA8Pm4jr(8__9gr*wwU zo=LTqumX)XD=|fgbgXemq5(xxq%$sPo!tZhHfH;Ui$r<=p2yCb?N$KRJZU8@6{Zr4 zOO~3HY8{r_p`%i*(~Mc2*lQ)v$h9sj*l4D4+EJg91(~89bJPk!g(hS+Y=s(e8gu4_ zbHm)Q!q9Wv3jc2p?VWVjx-IEwqLHLGSjiV9y7B_#4-{mptNN!JRlrG4%lu+7brGzp!O`F}=ceKZm zna|}zk_z#F^zC`LXaC%@U+M=6YO;0JscMe|7r9=Xox^G9k9ymXyP}`u=Ov z=YBZ*;o~!xU*Ye?H~9`6IkKf#++rAfHr~H!p_TThmVBH|3OZh&n!fP-^u*IwU%NP` zV?2I0uO+(ql@_(`nE2?!shLOKUKr0>VBe0r*Yy?3qX)1kWpD&ALl-ww74NN?$sf!- z@w4f37oG7l=O6#*^1F8P?tDQXEStNbhNZi((i?9XET=$AY6NHtw#PBO7mn5O|@MEK@in**!3Q zb`Am0|4+$jwjUe1Xw*b?qPces4c#-uh6Hbmb0OmGn8T}a{sYINR966SywMGimn`XU z`>OiT8`~P5b$eqVYPckI}cSP|jrw#z5bqI8^A1lYIh)i3y46YMxvfUQo0xv`<=JQ6n|}8E)k~MJUVhn0y$z}9N8e#Z^bSh|Rzf!v zagS;W&WthO*dC)=EN0oMDi-J0YN_M4n=hWLc$Zzse}PGwO?ZKAxekiuGqEfgtsK!giS6z5*Bm?$qr&9P8Qvc|Lli>P)Qd z;&mPaBbpClml3;GgBlfU4r*#DmK^Z(W($Q323HG;J?5;bF~9g&x*A%@S-pimR9J!c zL!Fg0@4R-KjMQzVkT2=%1vD!m}?1?!T3t}ws~=3lZbq9~#Ollw%-r`yGQD_hNTsdez5#51WlImus3 z^&SyhY!cMjfn2#zfWra)Q82_VW9R41E$4^r-R5M=0IaNJbOLx`fxHA42+^VEQ2s9y C@`&~rKv=*m3kw7jmIZhSsd)9>PzAhWxygR|$ zS&DtbkFsm5o!@4;P#Z3!`ToE?rtk-WS=Pv>gWs|@_`~2TlX+PhWpVz6)Dude3zh#Q zwIvmlKtKaY^$4w%htfw3cf1wOhdXID!{x}>dN+!GWLbB0&jU!)LI+LBQ>%-n<5{I5 z#AzHd0ZQNpT@SKh5U!-WQ+CxTqGlDBN{}Ms9#Upz)%zm~enQKep0=&pi%RYYLS5yD z$OdBrd`3Rj+J$3PUIO#AvRPbQ1EL1){Dy4D50f}d@B~+42WJnFNW7qrAd8Td1S@zF zWi`5pq&(Ahpr6+N9gi(Cq(Z#s_}j4u@mA7nBY2EAiw(l*Lf3%s&k&}`E#!5ZvR zBF*CjwWtKE0Scm7Edp>14OjT}Led(HtszZPd9Y7)jC%Q68v04cW(^`z9KSgSni65>U)(N4fBk( z@;@6h@d;uzNbo#wZOWnBiKel+|L{O+qKYI3N{cW_<5~`zghSFXU03+LS^2T)E!ISJ zs0J~;szh|4hhJ%)Vw3#y=4V(Rzu)}qI_>F4Y_!w!rW!F!Pqzy0>8t`d^7b^ra|AO4 zqRysC3J^3S_))wEOP*z$)%;Tdic<6?Z_P0whU4vme391Z0KP2lkY@L}`Mo8Xr0e0X zxmdFrm~N>Ic&ZA&lbmOR{6Nc*HbJ?;T*b}NYthq+Iy!Hu*ezd5Y>1~)19-NRsqV0d z*BM?+jgC{qe$6h;{P(iCGNFP zoqB49E+`Ju9d z0rG=Z(ene2p~@F9SgxmIBY257cJ#B$e5~Vm-xS$VJ5d#i`%Zk+xL48)-7`gMoB78b zU0Q~ej}n|Epj7)I*C|-uuF4R#EPBy7QWE*kR2$MfOfqy&vXtmji2qz zuD?nX7J@1xXYe%3SXCot(I5dM3hN^XgI3u8&dWw^duF2 zCYMoww6gnv8YUdzw{ra{8AU(h=-zU{T7WQ(B#}mk|C;NWE|Ml$)pF`b$FXF~1@i<| SwMz{#%IH_mk^-x8H1H2ibj4Nx diff --git a/function/system_monitor.py b/function/system_monitor.py index 83a6dc5..e39d213 100644 --- a/function/system_monitor.py +++ b/function/system_monitor.py @@ -9,6 +9,9 @@ import time import mlflow from mlflow.tracking import MlflowClient import pandas as pd +import json +from collections import Counter +import re class SystemMonitor: """系统资源监控类""" @@ -344,4 +347,152 @@ class SystemMonitor: 'error_type': type(e).__name__, 'error_message': str(e) } + } + + def get_system_logs( + self, + level: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + module: Optional[str] = None, + page: int = 1, + page_size: int = 20 + ) -> Dict: + """ + 获取系统日志 + + Args: + level: 日志级别过滤 + start_time: 开始时间 (YYYY-MM-DDThh:mm:ss) + end_time: 结束时间 (YYYY-MM-DDThh:mm:ss) + module: 模块名称过滤 + page: 页码 + page_size: 每页数量 + + Returns: + 系统日志信息 + """ + try: + # 获取所有日志文件 + log_dir = Path('.log') + log_files = sorted(log_dir.glob('*.log'), reverse=True) + + if not log_files: + return { + 'status': 'error', + 'message': '未找到日志文件', + 'details': { + 'error_type': 'FileNotFoundError', + 'error_message': 'No log files found' + } + } + + # 解析时间范围 + start_dt = pd.to_datetime(start_time) if start_time else None + end_dt = pd.to_datetime(end_time) if end_time else pd.Timestamp.now() + + # 读取并解析日志 + all_logs = [] + level_counts = Counter() + error_types = Counter() + module_counts = Counter() + + log_pattern = re.compile( + r'(?P\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2},\d{3})\s+-\s+' + r'(?P[\w.]+)\s+-\s+' + r'(?P\w+)\s+-\s+' + r'(?P.*?)(?:\s+\{(?P
.*)\})?$' + ) + + for log_file in log_files: + with open(log_file, 'r', encoding='utf-8') as f: + for line in f: + match = log_pattern.match(line.strip()) + if not match: + continue + + log_data = match.groupdict() + log_time = pd.to_datetime(log_data['timestamp']) + + # 时间范围过滤 + if start_dt and log_time < start_dt: + continue + if log_time > end_dt: + continue + + # 级别过滤 + log_level = log_data['level'].upper() + if level and log_level != level.upper(): + continue + + # 模块过滤 + log_module = log_data['name'] + if module and log_module != module: + continue + + # 解析详细信息和上下文 + try: + details = json.loads('{' + log_data.get('details', '') + '}') + except: + details = {} + + # 统计信息 + level_counts[log_level] += 1 + if log_level == 'ERROR': + error_types[log_data['message'].split(':')[0]] += 1 + module_counts[log_module] += 1 + + # 格式化日志记录 + log_entry = { + 'timestamp': log_time.strftime('%Y-%m-%dT%H:%M:%S'), + 'level': log_level, + 'module': log_module, + 'message': log_data['message'], + 'details': details, + 'context': { + k: v for k, v in details.items() + if k in ['experiment_id', 'run_id', 'model'] + } + } + all_logs.append(log_entry) + + # 计算分页 + total_items = len(all_logs) + total_pages = (total_items + page_size - 1) // page_size + start_idx = (page - 1) * page_size + end_idx = min(start_idx + page_size, total_items) + + # 生成摘要信息 + summary = { + 'error_count': level_counts.get('ERROR', 0), + 'warning_count': level_counts.get('WARNING', 0), + 'info_count': level_counts.get('INFO', 0), + 'most_frequent_error': error_types.most_common(1)[0][0] if error_types else None, + 'most_affected_module': module_counts.most_common(1)[0][0] if module_counts else None + } + + self.logger.info(f"成功获取系统日志, 共{total_items}条记录") + + return { + 'status': 'success', + 'logs': all_logs[start_idx:end_idx], + 'pagination': { + 'current_page': page, + 'page_size': page_size, + 'total_pages': total_pages, + 'total_items': total_items + }, + 'summary': summary + } + + except Exception as e: + error_msg = f"获取系统日志失败: {str(e)}" + self.logger.error(error_msg) + return { + 'status': 'error', + 'message': '获取系统日志失败', + 'details': { + 'error_type': type(e).__name__, + 'error_message': str(e) + } } \ No newline at end of file