Add PaddleOCR project

haotian 2025-08-14 16:21:47 +08:00
parent b6bfe484be
commit cb8384c5bb
1554 changed files with 459104 additions and 0 deletions

34
PaddleOCR-3.1.0/.gitignore vendored Normal file

@@ -0,0 +1,34 @@
# Byte-compiled / optimized / DLL files
__pycache__/
.ipynb_checkpoints/
*.py[cod]
*$py.class
# C extensions
*.so
inference/
inference_results/
output/
train_data/
log/
*.DS_Store
*.vs
*.user
*~
*.vscode
*.idea
*.log
.clang-format
.clang_format.hook
build/
dist/
paddleocr.egg-info/
/deploy/android_demo/app/OpenCV/
/deploy/android_demo/app/PaddleLite/
/deploy/android_demo/app/.cxx/
/deploy/android_demo/app/cache/
test_tipc/web/models/
test_tipc/web/node_modules/

45
PaddleOCR-3.1.0/.pre-commit-config.yaml Normal file

@@ -0,0 +1,45 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-added-large-files
        args: ['--maxkb=512']
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: trailing-whitespace
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
  - repo: https://github.com/Lucas-C/pre-commit-hooks
    rev: v1.5.5
    hooks:
      - id: remove-crlf
      - id: remove-tabs
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
  - repo: local
    hooks:
      - id: clang-format
        name: clang-format
        description: Format files with ClangFormat
        entry: bash .clang_format.hook -i
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
  # For Python files
  - repo: https://github.com/psf/black.git
    rev: 24.10.0
    hooks:
      - id: black
        files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
  # Flake8
  - repo: https://github.com/pycqa/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
        args:
          - --count
          - --select=E9,F63,F7,F82,E721
          - --show-source
          - --statistics
        exclude: ^benchmark/|^test_tipc/

3
PaddleOCR-3.1.0/.style.yapf Normal file

@@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80

201
PaddleOCR-3.1.0/LICENSE Normal file

@@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

22
PaddleOCR-3.1.0/MANIFEST.in Normal file

@@ -0,0 +1,22 @@
prune .github
prune applications
prune benchmark
prune configs
prune deploy
prune doc
prune docs
prune overrides
prune ppocr/ext_op
prune ppocr/losses
prune ppocr/metrics
prune ppocr/modeling
prune ppocr/optimizer
prune ppstructure/docs
prune test_tipc
prune tests
exclude .clang_format.hook
exclude .gitignore
exclude .pre-commit-config.yaml
exclude .style.yapf
exclude mkdocs.yml
exclude train.sh

353
PaddleOCR-3.1.0/README.md Normal file

@@ -0,0 +1,353 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
English | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. It's already powering popular open-source projects like Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide.
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for Chinese Heterogeneous AI Accelerators such as **KUNLUNXIN** and **Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
Three Major New Features in PaddleOCR 3.0:
- Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by ERNIE 4.5 Turbo, achieving 15 percentage points higher accuracy than its predecessor. [Online Demo](https://aistudio.baidu.com/community/app/518493/webUI)
In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
</p>
</div>
## 📣 Recent updates
#### **2025.06.29: Release of PaddleOCR 3.1.0**, includes:
- **Key Models and Pipelines:**
- **Added PP-OCRv5 Multilingual Text Recognition Model**, which supports the training and inference process for text recognition models in 37 languages, including French, Spanish, Portuguese, Russian, Korean, etc. **Average accuracy improved by over 30%.** A minimal usage sketch follows this list. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
- Upgraded the **PP-Chart2Table model** in PP-StructureV3, further enhancing the capability of converting charts to tables. On internal custom evaluation sets, the metric (RMS-F1) **increased by 9.36 percentage points (71.24% -> 80.60%).**
- Newly launched **document translation pipeline, PP-DocTranslation, based on PP-StructureV3 and ERNIE 4.5 Turbo**, which supports the translation of Markdown format documents, various complex-layout PDF documents, and document images, with the results saved as Markdown format documents. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
- **New MCP server:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
- **Supports both OCR and PP-StructureV3 pipelines.**
- Supports three working modes: local Python library, AIStudio Community Cloud Service, and self-hosted service.
- Supports invoking local services via stdio and remote services via Streamable HTTP.
- **Documentation Optimization:** Improved the descriptions in some user guides for a smoother reading experience.
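A minimal sketch of the new multilingual flow, using the documented `lang` argument; paddleocr>=3.1.0 is assumed, the input file name is a placeholder, and the model actually downloaded depends on your installed version:
```python
# Minimal sketch: recognize French text with the PP-OCRv5 multilingual models.
# Assumes paddleocr>=3.1.0; "french_doc.png" is a placeholder input path.
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    lang="fr",  # one of the 37 newly supported languages
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)
for res in ocr.predict(input="french_doc.png"):
    res.print()
    res.save_to_json("output")
```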
#### **2025.06.26: Release of PaddleOCR 3.0.3**, includes:
- Bug Fix: Resolved the issue where the `enable_mkldnn` parameter was not effective, restoring the default behavior of using MKL-DNN for CPU inference.
#### 🔥🔥 **2025.06.19: Release of PaddleOCR 3.0.2**, includes:
- **New Features:**
- The default download source has been changed from `BOS` to `HuggingFace`. Users can also change the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to set the model download source back to Baidu Object Storage (BOS); a minimal sketch follows these release notes.
- Added service invocation examples for six languages—C++, Java, Go, C#, Node.js, and PHP—for pipelines like PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
- Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, enhancing the sorting logic for complex vertical layouts to deliver better results.
- Enhanced model selection logic: when a language is specified but a model version is not, the system will automatically select the latest model version supporting that language.
- Set a default upper limit for MKL-DNN cache size to prevent unlimited growth, while also allowing users to configure cache capacity.
- Updated default configurations for high-performance inference to support Paddle MKL-DNN acceleration and optimized the logic for automatic configuration selection for smarter choices.
- Adjusted the logic for obtaining the default device to consider the actual support for computing devices by the installed Paddle framework, making program behavior more intuitive.
- Added Android example for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
- **Bug Fixes:**
- Fixed an issue with some CLI parameters in PP-StructureV3 not taking effect.
- Resolved an issue where `export_paddlex_config_to_yaml` would not function correctly in certain cases.
- Corrected the discrepancy between the actual behavior of `save_path` and its documentation description.
- Fixed potential multithreading errors when using MKL-DNN in basic service deployment.
- Corrected channel order errors in image preprocessing for the Latex-OCR model.
- Fixed channel order errors in saving visualized images within the text recognition module.
- Resolved channel order errors in visualized table results within PP-StructureV3 pipeline.
- Fixed an overflow issue in the calculation of `overlap_ratio` under extremely special circumstances in the PP-StructureV3 pipeline.
- **Documentation Improvements:**
- Updated the description of the `enable_mkldnn` parameter in the documentation to accurately reflect the program's actual behavior.
- Fixed errors in the documentation regarding the `lang` and `ocr_version` parameters.
- Added instructions for exporting production line configuration files via CLI.
- Fixed missing columns in the performance data table for PP-OCRv5.
- Refined benchmark metrics for PP-StructureV3 across different configurations.
- **Others:**
- Relaxed version restrictions on dependencies like numpy and pandas, restoring support for Python 3.12.
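For the download-source switch in the 3.0.2 notes above, a minimal sketch: the `PADDLE_PDX_MODEL_SOURCE` variable name comes from those notes, everything else is illustrative, and the variable should be set before the first model download is triggered:
```python
# Minimal sketch: point model downloads back at Baidu Object Storage (BOS).
# Set the variable before PaddleOCR attempts its first model download.
import os

os.environ["PADDLE_PDX_MODEL_SOURCE"] = "BOS"  # default is "HuggingFace" since 3.0.2

from paddleocr import PaddleOCR  # imported after the variable is set

ocr = PaddleOCR()  # later model downloads should now come from BOS
```
Exporting the variable in the shell before launching Python works just as well.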
<details>
<summary><strong>History Log</strong></summary>
#### **🔥🔥 2025.06.05: Release of PaddleOCR 3.0.1, includes:**
- **Optimisation of certain models and model configurations:**
- Updated the default model configuration for PP-OCRv5, changing both detection and recognition from mobile to server models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64.
- Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%. The default text line orientation classifier for OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
- Optimised the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%.
- **Optimizations and fixes for some issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Official Release of **PaddleOCR v3.0**, including:
- **PP-OCRv5**: High-Accuracy Text Recognition Model for All Scenarios - Instant Text from Images/PDFs.
1. 🌐 Single-model support for **five** text types - Seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English** and **Japanese** within a single model.
2. ✍️ Improved **handwriting recognition**: Significantly better at complex cursive scripts and non-standard handwriting.
3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios.
- **PP-StructureV3**: General-Purpose Document Parsing - Unleash SOTA Image/PDF Parsing for Real-World Scenarios!
1. 🧮 **High-Accuracy multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
2. 🧠 Specialized capabilities include **seal recognition**, **chart-to-table conversion**, **table recognition with nested formulas/images**, **vertical text document parsing**, and **complex table structure analysis**.
- **PP-ChatOCRv4**: Intelligent Document Understanding - Extract Key Information, Not Just Text, from Images/PDFs.
1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
3. 🤝 Integrated [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
[History Log](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ Quick Start
### 1. Run online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Installation
Install PaddlePaddle by referring to the [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), and then install the PaddleOCR toolkit.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference by CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Get more information about "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference by API
**4.1 PP-OCRv5 Example**
```python
# Initialize PaddleOCR instance
from paddleocr import PaddleOCR
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)
# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
# Visualize the results and save the JSON results
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
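Beyond printing and saving, the results can also be consumed programmatically. A minimal sketch, assuming each result object in `result` is dict-like and exposes the `rec_texts` and `rec_scores` fields that appear in its saved JSON (verify the field names against your installed version):
```python
# Minimal sketch: pull recognized strings and confidences out of `result` from
# the example above; assumes dict-like access to the "rec_texts"/"rec_scores"
# fields shown in the saved JSON output.
for res in result:
    for text, score in zip(res["rec_texts"], res["rec_scores"]):
        print(f"{score:.3f}\t{text}")
```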
<details>
<summary><strong>4.2 PP-StructureV3 Example</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)
# For Image
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)
# Visualize the results and save the JSON results
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 Example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc
chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}
retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}
pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)
visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)
mllm_predict_info = None
use_mllm = False
# If a multimodal large model is used, a local mllm service must be started first.
# Refer to https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md
# for deployment, and update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]
visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]
vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. Chinese Heterogeneous AI Accelerators
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
## ⛰️ Advanced Tutorials
- [PP-OCRv5 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick Overview of Execution Results
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
</p>
</div>
## 👩‍👩‍👧‍👦 Community
| PaddlePaddle WeChat official account | Join the tech discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Awesome Projects Leveraging PaddleOCR
PaddleOCR wouldn't be where it is today without its incredible community! 💗 A massive thank you to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR — whether we've named you or not. Your support fuels our fire!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Multi-type Document to Markdown Conversion Tool|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Recognize text on the screen, translate it and show the translation results in real time.|
| [Learn more projects](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

413
PaddleOCR-3.1.0/README_ar.md Normal file

@@ -0,0 +1,413 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
<p>
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | العربية
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</p>
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. It already powers popular open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide.
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also adds support for Chinese heterogeneous AI accelerators such as **KUNLUNXIN** and **Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
##### Three major new features in PaddleOCR 3.0:
Universal-scene text recognition model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): a single model that handles five different text types plus complex handwriting; overall recognition accuracy is up 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI)
General document-parsing solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI)
Intelligent document-understanding solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): natively powered by **ERNIE 4.5 Turbo**, achieving 15 percentage points higher accuracy than its predecessor. [Online Demo](https://aistudio.baidu.com/community/app/518493/webUI)
In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
<p align="center">
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
</p>
## 📣 Recent updates
#### 🔥🔥 **2025.06.29: Release of PaddleOCR 3.1.0**, includes:
- **Key Models and Pipelines:**
  - **Added the PP-OCRv5 multilingual text recognition model**, which supports training and inference for text recognition models in 37 languages, including French, Spanish, Portuguese, Russian, Korean, and more. **Average accuracy improved by over 30%.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
  - Upgraded the **PP-Chart2Table model** in PP-StructureV3, further enhancing chart-to-table conversion; on internal evaluation sets, the RMS-F1 metric rose by **9.36 percentage points (71.24% → 80.60%)**.
  - Launched the new document translation pipeline **PP-DocTranslation**, built on PP-StructureV3 and ERNIE 4.5 Turbo; it supports translating Markdown documents, complex-layout PDF documents, and document images, saving the results as Markdown documents. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
- **MCP support:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
  - **Supports the OCR and PP-StructureV3 pipelines.**
  - Supports three working modes: local Python library, AIStudio community cloud service, and self-hosted service.
  - Supports invoking local services via stdio and remote services via Streamable HTTP.
- **Documentation optimization:** improved the explanations in some user guides for a smoother reading experience.
#### 🔥🔥 **2025.06.26: Release of PaddleOCR 3.0.3**, includes:
- Bug fix: resolved the issue where the `enable_mkldnn` parameter was not effective, restoring the default behavior of using MKL-DNN for CPU inference.
#### 🔥🔥 **2025.06.19: Release of PaddleOCR 3.0.2**, includes:
- **New features:**
  - The default download source changed from `BOS` to `HuggingFace`; users can also set the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to switch the model download source back to Baidu Object Storage (BOS).
  - Added service invocation examples in six languages (C++, Java, Go, C#, Node.js, and PHP) for pipelines such as PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
  - Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, refining the sorting logic for complex vertical layouts to deliver better results.
  - Enhanced model selection logic: when a language is specified without a model version, the system automatically selects the latest model version supporting that language.
  - Set a default upper limit for the MKL-DNN cache size to prevent unbounded growth, while also allowing users to configure the cache capacity.
  - Updated the default configurations for high-performance inference to support Paddle MKL-DNN acceleration and optimized the automatic configuration selection logic for smarter choices.
  - Adjusted the logic for obtaining the default device to consider the computing devices actually supported by the installed Paddle framework, making program behavior more intuitive.
  - Added an Android example for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html)
- **Bug fixes:**
  - Fixed an issue where some CLI parameters in PP-StructureV3 did not take effect.
  - Resolved an issue where `export_paddlex_config_to_yaml` would not work correctly in certain cases.
  - Corrected the discrepancy between the actual behavior of `save_path` and its documentation.
  - Fixed potential multithreading errors when using MKL-DNN in basic service deployment.
  - Corrected channel-order errors in image preprocessing for the Latex-OCR model.
  - Fixed channel-order errors when saving visualized images in the text recognition module.
  - Resolved channel-order errors in visualized table results within the PP-StructureV3 pipeline.
  - Fixed an overflow in the `overlap_ratio` calculation under extremely special circumstances in the PP-StructureV3 pipeline.
- **Documentation improvements:**
  - Updated the description of the `enable_mkldnn` parameter to accurately reflect the program's actual behavior.
  - Fixed errors in the documentation regarding the `lang` and `ocr_version` parameters.
  - Added instructions for exporting pipeline configuration files via the CLI.
  - Fixed missing columns in the PP-OCRv5 performance data table.
  - Refined the PP-StructureV3 benchmark metrics across different configurations.
- **Others:**
  - Relaxed version restrictions on dependencies such as numpy and pandas, restoring support for Python 3.12.
<details>
<summary><strong>History Log</strong></summary>
#### **🔥🔥 2025.06.05: Release of PaddleOCR 3.0.1, includes:**
- **Optimization of certain models and model configurations:**
  1. Updated the default model configuration for PP-OCRv5, changing both detection and recognition from `mobile` to `server` models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64.
  2. Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%; the default text line orientation classifier for the OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
  3. Optimized the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%.
- **Optimizations and fixes for some issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Official release of **PaddleOCR v3.0**, including:
#### PP-OCRv5: High-accuracy text recognition for all scenarios - instant text from images/PDFs.
1. 🌐 Single-model support for **five** text types - seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English**, and **Japanese** within a single model.
2. ✍️ Improved **handwriting recognition**: significantly better at complex cursive scripts and non-standard handwriting.
3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios.
#### PP-StructureV3: General-purpose document parsing - unleash SOTA image/PDF parsing for real-world scenarios!
1. 🧮 **High-accuracy multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
2. 🧠 Specialized capabilities include **seal recognition**, **chart-to-table conversion**, **table recognition with nested formulas/images**, **vertical text document parsing**, and **complex table structure analysis**.
#### PP-ChatOCRv4: Intelligent document understanding - extract key information, not just text, from images/PDFs.
1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
3. 🤝 Integrated [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
[History Log](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ Quick Start
### 1. Run online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Installation
Install PaddlePaddle by referring to the [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), and then install the PaddleOCR toolkit.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference by CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Get more information about "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference by API
<details open>
<summary><strong>4.1 PP-OCRv5 Example</strong></summary>
```python
from paddleocr import PaddleOCR
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
</details>
<details>
<summary><strong>4.2 PP-StructureV3 Example</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)
# For images
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)
# Visualize the results and save the JSON results
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 Example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc
chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}
retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}
pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)
visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)
mllm_predict_info = None
use_mllm = False
# If a multimodal large model is used, a local mllm service must be started first.
# Refer to https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md
# for deployment, and update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]
visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]
vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. Chinese Heterogeneous AI Accelerators
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
## ⛰️ Advanced Tutorials
- [PP-OCRv5 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick Overview of Execution Results
<p align="center">
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
</p>
<p align="center">
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
</p>
## 👩‍👩‍👧‍👦 Community
| PaddlePaddle WeChat official account | Join the tech discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Awesome Projects Leveraging PaddleOCR
PaddleOCR wouldn't be where it is today without its incredible community! 💗 A massive thank you to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR, whether we've named you or not. Your support fuels our fire!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Multi-type document to Markdown conversion tool.|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, open-source, batch offline OCR software.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: screen parsing tool for pure-vision-based GUI agents.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and answer based on anything.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Recognizes text on the screen, translates it, and shows the translation results in real time.|
| [Learn more projects](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

344
PaddleOCR-3.1.0/README_cn.md Normal file

@@ -0,0 +1,344 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner_cn.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
[English](./README.md) | 简体中文 | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its release, PaddleOCR has been embraced by academia, industry, and research thanks to its cutting-edge algorithms and proven industrial practice, and it is widely used in well-known open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the open-source OCR toolkit of choice for developers. On May 20, 2025, the PaddlePaddle team released **PaddleOCR 3.0**, fully adapted to the official **PaddlePaddle framework 3.0**. It further **improves text-recognition accuracy**, supports **multiple text-type recognition** and **handwriting recognition**, meets the strong demand from large-model applications for **high-precision parsing of complex documents**, significantly improves key-information extraction accuracy when combined with **ERNIE 4.5 Turbo**, and adds support for **domestic hardware such as KUNLUNXIN and Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 documentation](https://paddlepaddle.github.io/PaddleOCR/latest/).
PaddleOCR 3.0 **adds** three major capabilities:
- Universal-scene text recognition model [PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md): a single model supports five text types and complex handwriting recognition; overall recognition accuracy is **up 13 percentage points** over the previous generation. [Online demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General document parsing solution [PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md): supports high-precision parsing of multi-scene, multi-layout PDFs, **leading numerous open- and closed-source solutions** on public benchmarks. [Online demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent document understanding solution [PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md): natively supports ERNIE 4.5 Turbo, with accuracy **up 15 percentage points** over the previous generation. [Online demo](https://aistudio.baidu.com/community/app/518493/webUI)
Beyond an excellent model library, PaddleOCR 3.0 also provides easy-to-use tools covering model training, inference, and service deployment, so developers can quickly bring AI applications to production.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch_cn.png" alt="PaddleOCR Architecture">
</p>
</div>
## 📣 Latest updates
🔥🔥2025.06.29: **PaddleOCR 3.1.0** released, with the following additions:
- **Key models and pipelines:**
  - **Added the PP-OCRv5 multilingual text recognition models**, covering training and inference for 37 languages, including French, Spanish, Portuguese, Russian, and Korean. **Average accuracy improved by more than 30%.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
  - Upgraded the **PP-Chart2Table model** in PP-StructureV3, further improving chart-to-table conversion; on an internal custom evaluation set, the RMS-F1 metric **rose by 9.36 percentage points (71.24% -> 80.60%).**
  - Added the **document translation pipeline PP-DocTranslation, built on PP-StructureV3 and ERNIE 4.5 Turbo; it translates Markdown documents, PDF documents with various complex layouts, and document images, and saves the results as Markdown documents.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-DocTranslation.html)
  - **Added an MCP server:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/mcp_server.html)
    - **Supports both the OCR and PP-StructureV3 pipelines;**
    - supports three working modes: local Python library, AI Studio community cloud service, and self-hosted service;
    - supports invoking local services via stdio and remote services via Streamable HTTP.
- **Documentation improvements:** polished the wording of several user guides for a smoother reading experience.
2025.06.26: **PaddleOCR 3.0.3** released, including:
- Bug fix: fixed the `enable_mkldnn` parameter not taking effect, restoring the default behavior of using MKL-DNN for CPU inference.
2025.06.19: **PaddleOCR 3.0.2** released, including:
- **New features:**
  - Changed the default model download source from `BOS` to `HuggingFace`; users can also set the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to switch model downloads back to Baidu Object Storage (BOS) — see the sketch after this changelog entry.
  - Added service invocation examples in six languages (C++, Java, Go, C#, Node.js, and PHP) for pipelines such as PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
  - Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, refining the ordering logic for complex vertical layouts.
  - Improved model selection logic: when a language is specified without a model version, the latest model version supporting that language is selected automatically.
  - Set a default upper bound on the MKL-DNN cache size to prevent unbounded growth, while still letting users configure the cache capacity.
  - Updated the default high-performance inference configuration to support Paddle MKL-DNN acceleration, and optimized the automatic configuration logic for smarter choices.
  - Adjusted the default device selection logic to account for the compute devices actually supported by the installed Paddle framework, making program behavior more intuitive.
  - Added an Android demo for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/on_device_deployment.html)
- **Bug fixes:**
  - Fixed some PP-StructureV3 CLI parameters not taking effect.
  - Fixed `export_paddlex_config_to_yaml` not working correctly in certain cases.
  - Fixed the actual behavior of `save_path` not matching its documentation.
  - Fixed possible multi-threading errors when basic serving deployments use MKL-DNN.
  - Fixed a channel-order error in image preprocessing for the Latex-OCR model.
  - Fixed a channel-order error when the text recognition module saves visualization images.
  - Fixed a channel-order error in PP-StructureV3 table visualization results.
  - Fixed a variable overflow when computing overlap_ratio in extremely rare cases in the PP-StructureV3 pipeline.
- **Documentation improvements:**
  - Updated the description of the `enable_mkldnn` parameter to more accurately reflect the program's actual behavior.
  - Fixed errors in the documentation of the `lang` and `ocr_version` parameters.
  - Added instructions for exporting pipeline configuration files via the CLI.
  - Fixed missing columns in the PP-OCRv5 performance data table.
  - Polished the PP-StructureV3 benchmark metrics under different configurations.
- **Others:**
  - Relaxed version restrictions on dependencies such as numpy and pandas, restoring support for Python 3.12.
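The download-source switch mentioned above is just an environment variable, so it can also be set from Python before the first model fetch. A minimal sketch (the variable name `PADDLE_PDX_MODEL_SOURCE` is the one this changelog names; everything else is standard library):
```python
import os

# Point model downloads back at Baidu Object Storage (BOS); this must be
# set before paddleocr triggers its first model download.
os.environ["PADDLE_PDX_MODEL_SOURCE"] = "BOS"

from paddleocr import PaddleOCR

ocr = PaddleOCR(use_doc_orientation_classify=False, use_doc_unwarping=False)
result = ocr.predict(input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
```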
<details>
<summary><strong>Update history</strong></summary>
2025.06.05: **PaddleOCR 3.0.1** released, including:
- **Optimized some models and model configurations:**
  - Updated the default PP-OCRv5 model configuration, switching both detection and recognition from mobile to server models. To improve out-of-the-box results in most scenarios, the `limit_side_len` parameter in the configuration was changed from 736 to 64.
  - Added the text-line orientation classification model `PP-LCNet_x1_0_textline_ori` (99.42% accuracy); it is now the default text-line orientation classifier for the OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines.
  - Optimized the text-line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current 98.85%.
- **Optimized and fixed several issues present in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)**
🔥🔥2025.05.20: **PaddleOCR 3.0** officially released, including:
- **PP-OCRv5**: high-accuracy text recognition for all scenarios
  1. 🌐 A single model supports **five** text types (**Simplified Chinese**, **Traditional Chinese**, **Chinese Pinyin**, **English**, and **Japanese**).
  2. ✍️ Supports complex **handwriting**: markedly better recognition of complex cursive and non-standard handwriting.
  3. 🎯 Higher overall accuracy - SOTA accuracy in a variety of application scenarios; recognition accuracy **improved by 13 percentage points** over the previous PP-OCRv4.
- **PP-StructureV3**: general document parsing solution
  1. 🧮 High-precision parsing of multi-scene PDFs, **leading many open- and closed-source solutions** on the OmniDocBench benchmark.
  2. 🧠 Multiple specialized capabilities: **seal recognition**, **chart-to-table conversion**, **recognition of tables with nested formulas/images**, **vertical text parsing**, **complex table structure analysis**, and more.
- **PP-ChatOCRv4**: intelligent document understanding solution
  1. 🔥 Key-information extraction accuracy on document images (PDF/PNG/JPG) **improved by 15 percentage points** over the previous generation.
  2. 💻 Natively supports **ERNIE 4.5 Turbo**, and is also compatible with large models deployed via PaddleNLP, Ollama, vLLM, and more.
  3. 🤝 Integrates [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), supporting extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
[More history](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)
</details>
## ⚡ Quick start
### 1. Online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Local installation
Install **PaddlePaddle 3.0** by following the [installation guide](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html), then install paddleocr.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference from the CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# A Qianfan API key is required before running PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Show the detailed arguments of "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference via the Python API
**4.1 PP-OCRv5 example**
```python
from paddleocr import PaddleOCR

# Initialize a PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

# Visualize the results and save them as JSON
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
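If you want the recognized strings programmatically rather than as files, the per-image result can be inspected directly. A minimal sketch, assuming the result object exposes a dict view (here via `res.json`) matching the saved JSON with `rec_texts`/`rec_scores` fields; verify against the current API before relying on it:
```python
from paddleocr import PaddleOCR

ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

for res in result:
    # Assumed: `res.json` mirrors the file written by save_to_json().
    payload = res.json.get("res", res.json)
    for text, score in zip(payload["rec_texts"], payload["rec_scores"]):
        print(f"{score:.3f}\t{text}")
```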
<details>
<summary><strong>4.2 PP-StructureV3 example</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# For an image
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)

# Visualize the results and save them as JSON
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
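For a multi-page PDF input, `predict` yields one result per page, so the per-page Markdown usually needs stitching. A sketch of that step; `doc.pdf` is a hypothetical local file, and `res.markdown` plus `pipeline.concatenate_markdown_pages` are assumptions inferred from the pipeline's Markdown support, so check the current docs before relying on them:
```python
from pathlib import Path
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False)

# One result object per PDF page.
output = pipeline.predict(input="doc.pdf")  # hypothetical local file

# Collect each page's Markdown, then merge it into a single document.
markdown_pages = [res.markdown for res in output]             # assumed attribute
merged = pipeline.concatenate_markdown_pages(markdown_pages)  # assumed helper
Path("output").mkdir(exist_ok=True)
Path("output/doc.md").write_text(str(merged), encoding="utf-8")
```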
<details>
<summary><strong>4.3 PP-ChatOCRv4 example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# To use a multimodal large model, start the local mllm service first; see
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md
# for deployment instructions, and update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. **Domestic hardware support**
- [Kunlunxin XPU installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
- [Ascend NPU installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
## ⛰️ Advanced guides
- [PP-OCRv5 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick look at the results
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
</p>
</div>
## 👩‍👩‍👧‍👦 Developer community
| Scan to follow the PaddlePaddle WeChat official account | Scan to join the technical discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 🏆 Awesome projects using PaddleOCR
PaddleOCR would not be where it is today without its community! 💗 Heartfelt thanks to all developers, partners, and contributors!
| Project name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG-based AI workflow engine|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Tool for converting multi-format documents to Markdown|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Open-source batch offline OCR software|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |Pure-vision-based screen parsing tool for GUI agents|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question answering based on any content|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Efficient extraction toolkit for complex PDF documents|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Real-time on-screen translation tool|
| [More projects](./awesome_projects.md) | |
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1,343 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | Español | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has been widely acclaimed across academia, industry, and the research community, thanks to its cutting-edge algorithms and proven performance in real-world applications. It already powers popular open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers everywhere.
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update **further boosts text recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. Combined with **ERNIE 4.5 Turbo**, it significantly improves key-information extraction accuracy. For the full usage documentation, see the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
Three major new features in PaddleOCR 3.0:
- Universal-scene text recognition model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model handles five different text types plus complex handwriting. Overall recognition accuracy is 13 percentage points higher than the previous generation. [Online demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General document parsing solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent document understanding solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by the ERNIE 4.5 Turbo large model, achieving accuracy 15 percentage points higher than its predecessor. [Online demo](https://aistudio.baidu.com/community/app/518493/webUI)
Beyond an excellent model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can quickly bring AI applications to production.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
</p>
</div>
## 📣 Latest updates
#### **2025.06.29: PaddleOCR 3.1.0 released**, including:
- **Key models and pipelines:**
  - **Added the PP-OCRv5 multilingual text recognition model**, supporting training and inference for text recognition models in 37 languages, including French, Spanish, Portuguese, Russian, Korean, and more. **Average accuracy improved by over 30%.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
  - Upgraded the **PP-Chart2Table model** in PP-StructureV3, further improving chart-to-table conversion. On internal custom evaluation sets, the RMS-F1 metric **rose by 9.36 percentage points (71.24% -> 80.60%).**
  - New **document translation pipeline, PP-DocTranslation, based on PP-StructureV3 and ERNIE 4.5 Turbo**, supporting translation of Markdown documents, various complex-layout PDFs, and document images, saving the results as Markdown documents. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
  - **New MCP server:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
    - **Supports both the OCR and PP-StructureV3 pipelines.**
    - Supports three working modes: local Python library, AIStudio community cloud service, and self-hosted service.
    - Supports invoking local services via stdio and remote services via Streamable HTTP.
- **Documentation improvements:** Polished the descriptions in several user guides for a smoother reading experience.
#### 🔥🔥**2025.06.26: PaddleOCR 3.0.3 released, including:**
- Bug fix: Resolved the issue where the `enable_mkldnn` parameter was not effective, restoring the default behavior of using MKL-DNN for CPU inference.
#### 🔥🔥**2025.06.19: PaddleOCR 3.0.2 released, including:**
- **New features:**
  - Changed the default model download source from `BOS` to `HuggingFace`. Users can also set the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to switch the download source back to Baidu Object Storage (BOS).
  - Added service invocation examples in six languages (C++, Java, Go, C#, Node.js, and PHP) for pipelines such as PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
  - Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, refining the ordering logic for complex vertical layouts to deliver better results.
  - Improved model selection logic: when a language is specified but no model version, the system automatically selects the latest model version that supports that language (see the sketch after this entry).
  - Set a default upper bound for the MKL-DNN cache size to prevent unbounded growth, while allowing users to configure the cache capacity.
  - Updated the default configurations for high-performance inference to support Paddle MKL-DNN acceleration, and optimized the automatic configuration logic for smarter choices.
  - Adjusted the logic for obtaining the default device to account for the compute devices actually supported by the installed Paddle framework, making program behavior more intuitive.
  - Added an Android example for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html)
- **Bug fixes:**
  - Fixed an issue where some CLI parameters in PP-StructureV3 had no effect.
  - Resolved an issue where `export_paddlex_config_to_yaml` did not work correctly in certain cases.
  - Fixed the mismatch between the actual behavior of `save_path` and its documentation.
  - Fixed potential multi-threading errors when using MKL-DNN in basic service deployment.
  - Fixed channel-order errors in image preprocessing for the Latex-OCR model.
  - Fixed channel-order errors when saving visualization images in the text recognition module.
  - Resolved channel-order errors in visualized table results in the PP-StructureV3 pipeline.
  - Fixed an overflow issue when computing `overlap_ratio` under extremely special circumstances in the PP-StructureV3 pipeline.
- **Documentation improvements:**
  - Updated the description of the `enable_mkldnn` parameter to accurately reflect the program's actual behavior.
  - Fixed documentation errors regarding the `lang` and `ocr_version` parameters.
  - Added instructions for exporting pipeline configuration files via the CLI.
  - Fixed missing columns in the PP-OCRv5 performance data table.
  - Refined the PP-StructureV3 benchmark metrics for different configurations.
- **Others:**
  - Relaxed version restrictions on dependencies such as numpy and pandas, restoring support for Python 3.12.
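To make the model-selection change above concrete, here is a minimal sketch of specifying a language without pinning a version, so the newest model supporting it is chosen; `lang` and `ocr_version` are the parameters this changelog refers to, though the exact language codes are assumptions to check against the docs:
```python
from paddleocr import PaddleOCR

# Language only: the latest model version supporting Korean is
# selected automatically.
ocr_ko = PaddleOCR(lang="korean")

# Language plus an explicit version pin, when reproducibility matters.
ocr_fr = PaddleOCR(lang="fr", ocr_version="PP-OCRv5")
```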
<details>
<summary><strong>Update history</strong></summary>
#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1 released, including:**
- **Optimized certain models and model configurations:**
  - Updated the default model configuration for PP-OCRv5, switching both detection and recognition from `mobile` to `server` models. To improve default performance in most scenarios, the `limit_side_len` parameter in the configuration was changed from 736 to 64.
  - Added a new text-line orientation classification model, `PP-LCNet_x1_0_textline_ori`, with 99.42% accuracy. The default text-line orientation classifier for the OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
  - Optimized the text-line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current 98.85%.
- **Optimizations and fixes for several issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Official release of **PaddleOCR v3.0**, including:
- **PP-OCRv5**: High-accuracy text recognition for all scenarios - instant text from images/PDFs.
  1. 🌐 Single-model support for **five** text types - seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English**, and **Japanese** within a single model.
  2. ✍️ Improved **handwriting recognition**: significantly better on complex cursive and non-standard handwriting.
  3. 🎯 **13-point accuracy gain** over PP-OCRv4, reaching state-of-the-art performance across a variety of real-world scenarios.
- **PP-StructureV3**: General-purpose document parsing unleash SOTA image/PDF parsing for real-world scenarios!
  1. 🧮 **High-precision multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
  2. 🧠 Specialized capabilities including **seal recognition**, **chart-to-table conversion**, **recognition of tables with nested formulas/images**, **vertical-text document parsing**, and **complex table structure analysis**.
- **PP-ChatOCRv4**: Intelligent document understanding extract key information, not just text, from images/PDFs.
  1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
  2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
  3. 🤝 Integrated with [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
[Update history](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ Quick start
### 1. Run the online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Installation
Install PaddlePaddle by following the [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), then install the PaddleOCR toolkit.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference from the CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Obtain a Qianfan API key first, then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Get more information about "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference via the API
**4.1 PP-OCRv5 example**
```python
from paddleocr import PaddleOCR

# Initialize the PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

# Visualize the results and save them as JSON
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
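The 3.0.1 notes above mention that detection and recognition now default to `server` models and that `limit_side_len` changed to 64. A sketch of overriding those defaults for lighter CPU-bound deployments; the parameter names (`text_detection_model_name`, `text_recognition_model_name`, `text_det_limit_side_len`) are assumptions to verify against the current documentation:
```python
from paddleocr import PaddleOCR

# Swap the default `server` models for the lighter `mobile` variants and
# restore the larger detection side-length limit; names are assumed.
ocr = PaddleOCR(
    text_detection_model_name="PP-OCRv5_mobile_det",
    text_recognition_model_name="PP-OCRv5_mobile_rec",
    text_det_limit_side_len=736,
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
for res in result:
    res.print()
```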
<details>
<summary><strong>4.2 PP-StructureV3 example</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# For an image
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)

# Visualize the results and save them as JSON
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# To use a multimodal large model, start the local mllm service first; see
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md
# for deployment instructions, and update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
## ⛰️ Advanced tutorials
- [PP-OCRv5 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick look at the execution results
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="Demo de PP-OCRv5">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="Demo de PP-StructureV3">
</p>
</div>
## 👩‍👩‍👧‍👦 Community
| PaddlePaddle official WeChat account | Join the technical discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Awesome projects leveraging PaddleOCR
PaddleOCR wouldn't be where it is today without its amazing community! 💗 A huge thank-you to all our long-time partners, new collaborators, and everyone who has poured their passion into PaddleOCR, whether or not we have named you. Your support fuels our fire!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Tool for converting documents of many types into Markdown.|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, open-source, batch offline OCR software.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: screen parsing tool for pure-vision-based GUI agents.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question answering based on anything.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Recognizes text on the screen, translates it, and displays the translation results in real time.|
| [Learn about more projects](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}

View File

@ -0,0 +1,342 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | Français | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Introduction
Since its initial release, PaddleOCR has been widely acclaimed by academia, industry, and the research community, thanks to its state-of-the-art algorithms and proven performance in real-world applications. It already powers popular open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide.
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update **further improves text recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and answers the growing demand from large-model applications for **high-precision parsing of complex documents**. Combined with **ERNIE 4.5 Turbo**, it considerably improves key-information extraction accuracy. For the full usage documentation, see the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
Three major new features in PaddleOCR 3.0:
- Universal-scene text recognition model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model handles five different text types as well as complex handwriting. Overall recognition accuracy is 13 percentage points higher than the previous generation. [Online demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General document parsing solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Provides high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent document understanding solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by the ERNIE 4.5 Turbo large model, achieving accuracy 15 percentage points higher than its predecessor. [Online demo](https://aistudio.baidu.com/community/app/518493/webUI)
In addition to an exceptional model library, PaddleOCR 3.0 offers user-friendly tools covering model training, inference, and service deployment, so developers can quickly bring AI applications to production.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
</p>
</div>
## 📣 Recent updates
#### **2025.06.29: PaddleOCR 3.1.0 released**, including:
- **Key models and pipelines:**
  - **Added the PP-OCRv5 multilingual text recognition model**, supporting training and inference for 37 languages, including French, Spanish, Portuguese, Russian, Korean, and more. **Average accuracy improved by over 30%.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
  - Upgraded the **PP-Chart2Table model** in PP-StructureV3, further improving chart-to-table conversion. On internal custom evaluation sets, the RMS-F1 metric **rose by 9.36 percentage points (71.24% -> 80.60%).**
  - Launched the **document translation pipeline, PP-DocTranslation, based on PP-StructureV3 and ERNIE 4.5 Turbo**, supporting translation of Markdown documents, complex-layout PDFs, and document images, with results saved in Markdown format. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
  - **New MCP server:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
    - **Supports both the OCR and PP-StructureV3 pipelines.**
    - Supports three working modes: local Python library, AIStudio community cloud service, and self-hosted service.
    - Supports invoking local services via stdio and remote services via Streamable HTTP.
- **Documentation improvements:** Polished the descriptions in several user guides for a smoother reading experience.
#### **2025.06.26: PaddleOCR 3.0.3 released, including:**
- Bug fix: Resolved the issue where the `enable_mkldnn` parameter did not work, restoring the default behavior of using MKL-DNN for CPU inference; see the sketch below.
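For reference, a minimal sketch of toggling that parameter explicitly rather than relying on the default; this assumes `enable_mkldnn` (and the `device` argument) are accepted by the pipeline constructor, as the fix above implies:
```python
from paddleocr import PaddleOCR

# CPU inference with MKL-DNN explicitly enabled (the default behavior
# that 3.0.3 restores); pass False to opt out.
ocr = PaddleOCR(device="cpu", enable_mkldnn=True)
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
```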
#### 🔥🔥**2025.06.19: PaddleOCR 3.0.2 released, including:**
- **New features:**
  - Changed the default model download source from `BOS` to `HuggingFace`. Users can also set the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to switch the download source back to Baidu Object Storage (BOS).
  - Added service invocation examples in six languages — C++, Java, Go, C#, Node.js, and PHP — for pipelines such as PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
  - Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, refining the ordering logic for complex vertical layouts to deliver better results.
  - Improved model selection logic: when a language is specified but no model version, the system automatically selects the latest model version that supports that language.
  - Set a default upper bound for the MKL-DNN cache size to prevent unbounded growth, while allowing users to configure the cache capacity.
  - Updated the default configurations for high-performance inference to support Paddle MKL-DNN acceleration, and optimized the automatic configuration logic for smarter choices.
  - Adjusted the logic for obtaining the default device to account for the compute devices actually supported by the installed Paddle framework, making program behavior more intuitive.
  - Added an Android example for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html)
- **Bug fixes:**
  - Fixed an issue where some CLI parameters in PP-StructureV3 had no effect.
  - Resolved an issue where `export_paddlex_config_to_yaml` did not work correctly in certain cases.
  - Fixed the mismatch between the actual behavior of `save_path` and its documentation.
  - Fixed potential multi-threading errors when using MKL-DNN in basic service deployment.
  - Fixed channel-order errors in image preprocessing for the Latex-OCR model.
  - Fixed channel-order errors when saving visualization images in the text recognition module.
  - Resolved channel-order errors in visualized table results in the PP-StructureV3 pipeline.
  - Fixed an overflow issue when computing `overlap_ratio` under very special circumstances in the PP-StructureV3 pipeline.
- **Documentation improvements:**
  - Updated the description of the `enable_mkldnn` parameter to accurately reflect the program's actual behavior.
  - Fixed documentation errors regarding the `lang` and `ocr_version` parameters.
  - Added instructions for exporting pipeline configuration files via the CLI.
  - Fixed missing columns in the PP-OCRv5 performance data table.
  - Refined the PP-StructureV3 benchmark metrics for different configurations.
- **Others:**
  - Relaxed version restrictions on dependencies such as numpy and pandas, restoring support for Python 3.12.
<details>
<summary><strong>Update history</strong></summary>
#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1 released, including:**
- **Optimized certain models and their configurations:**
  - Updated the default model configuration for PP-OCRv5, switching both detection and recognition from `mobile` to `server` models. To improve default performance in most scenarios, the `limit_side_len` parameter in the configuration was changed from 736 to 64.
  - Added a new text-line orientation classification model, `PP-LCNet_x1_0_textline_ori`, with 99.42% accuracy. The default text-line orientation classifier for the OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
  - Optimized the text-line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current 98.85%.
- **Optimizations and fixes for several issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Official release of **PaddleOCR v3.0**, including:
- **PP-OCRv5**: High-accuracy text recognition for all scenarios - instant text from images/PDFs.
  1. 🌐 Single-model support for **five** text types - seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English**, and **Japanese** within a single model.
  2. ✍️ Improved **handwriting recognition**: markedly better on complex cursive and non-standard handwriting.
  3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios.
- **PP-StructureV3**: General-purpose document parsing unleash SOTA image/PDF parsing for real-world scenarios!
  1. 🧮 **High-precision multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
  2. 🧠 Specialized capabilities include **seal recognition**, **chart-to-table conversion**, **recognition of tables with nested formulas/images**, **vertical-text document parsing**, and **complex table structure analysis**.
- **PP-ChatOCRv4**: Intelligent document understanding extract key information, not just text, from images/PDFs.
  1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
  2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
  3. 🤝 Integrated with [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
[Update history](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ Quick start
### 1. Run the online demo
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Installation
Install PaddlePaddle by following the [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), then install the PaddleOCR toolkit.
```bash
# Install paddleocr
pip install paddleocr
```
### 3. Run inference from the CLI
```bash
# Run PP-OCRv5 inference
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Run PP-StructureV3 inference
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Obtain a Qianfan API key first, then run PP-ChatOCRv4 inference
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Get more information about "paddleocr ocr"
paddleocr ocr --help
```
### 4. Run inference via the API
**4.1 PP-OCRv5 example**
```python
from paddleocr import PaddleOCR

# Initialize the PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)

# Run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")

# Visualize the results and save them as JSON
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3 example</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# For an image
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)

# Visualize the results and save them as JSON
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 example</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# To use a multimodal large model, start the local mllm service first; see
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md
# for deployment instructions, and update mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }
    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```
</details>
## ⛰️ Advanced tutorials
- [PP-OCRv5 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Quick look at the execution results
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="Démo PP-OCRv5">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="Démo PP-StructureV3">
</p>
</div>
## 👩‍👩‍👧‍👦 Community
| PaddlePaddle official WeChat account | Join the technical discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Awesome projects using PaddleOCR
PaddleOCR wouldn't be where it is today without its incredible community! 💗 A huge thank-you to all our long-time partners, new collaborators, and everyone who has poured their passion into PaddleOCR, whether or not we have named you. Your support keeps us going!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Tool for converting documents of many types into Markdown.|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, open-source, batch offline OCR software.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a>|Screen parsing tool for pure-vision-based GUI agents.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a>|Question answering based on anything.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a>|Recognizes text on the screen, translates it, and displays the translation results in real time.|
| [Learn more](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1,340 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | 日本語 | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Overview
Thanks to its cutting-edge algorithms and proven real-world adoption, PaddleOCR has earned broad support from academia, industry, and the research community since its first release. It is already used by popular open-source projects such as Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the standard OCR toolkit for developers worldwide.
On May 20, 2025, the PaddlePaddle team announced PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further improves **text recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. Combined with **ERNIE 4.5 Turbo**, it significantly improves key-information extraction accuracy. For complete usage instructions, see the [PaddleOCR 3.0 documentation](https://paddlepaddle.github.io/PaddleOCR/latest/ja/index.html).
Three major new features in PaddleOCR 3.0:
- Universal-scene text recognition model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model handles five different text types and complex handwriting. Overall recognition accuracy is 13 percentage points higher than the previous generation. [Online demo](https://aistudio.baidu.com/community/app/91660/webUI)
- General document parsing solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Achieves high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online demo](https://aistudio.baidu.com/community/app/518494/webUI)
- Intelligent document understanding solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by ERNIE 4.5 Turbo, achieving accuracy 15 percentage points higher than the previous generation. [Online demo](https://aistudio.baidu.com/community/app/518493/webUI)
PaddleOCR 3.0 not only provides an excellent model library but also offers easy-to-use tools covering model training, inference, and service deployment, helping developers bring AI applications to production quickly.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
</p>
</div>
## 📣 最近のアップデート
#### **2025.06.29:PaddleOCR 3.1.0 をリリース**、内容は以下の通りです:
- **主なモデルとパイプライン:**
- **PP-OCRv5 多言語テキスト認識モデルを追加**、フランス語、スペイン語、ポルトガル語、ロシア語、韓国語など 37 言語に対応。**平均精度が 30%以上向上。**(本節末尾のスケッチも参照)[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
- PP-StructureV3 の **PP-Chart2Table モデル**をアップグレードし、グラフから表への変換能力をさらに強化。社内カスタム評価セットでは、指標(RMS-F1)が **9.36 ポイント向上(71.24% → 80.60%)。**
- PP-StructureV3 および ERNIE 4.5 Turbo に基づく**ドキュメント翻訳パイプライン PP-DocTranslation**を新たに追加。Markdown 形式ドキュメント、さまざまな複雑レイアウトの PDF ドキュメント、ドキュメント画像の翻訳に対応し、結果を Markdown 形式で保存可能。[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
- **新しい MCP サーバー:**[Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
- **OCR と PP-StructureV3 パイプラインの両方をサポートします。**
- ローカル Python ライブラリ、AIStudio コミュニティクラウドサービス、セルフホストサービスの3つの動作モードをサポートします。
- stdio を介してローカルサービスを呼び出し、Streamable HTTP を介してリモートサービスを呼び出すことができます。
- **ドキュメント最適化:** 一部のユーザーガイドの説明を改善し、よりスムーズな読書体験を提供。
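以下は、上記の多言語モデルに関する説明を補足する最小限のスケッチです(本文で言及されている `lang` パラメータに基づく想定例であり、公式 API の確定仕様ではありません。`fr` という言語コードや画像パスは例示用の仮定です):
```python
# 仮定に基づく最小スケッチ:lang パラメータで多言語認識モデルを選択する例
from paddleocr import PaddleOCR
# 言語のみを指定した場合、その言語をサポートする最新バージョンの
# モデルが自動選択される想定(3.0.2 の更新内容の記述に基づく)
ocr_fr = PaddleOCR(
    lang="fr",  # フランス語(言語コードは例示用の仮定)
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)
result = ocr_fr.predict(input="path/to/french_text.png")  # 仮の画像パス
for res in result:
    res.print()
```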
#### 🔥🔥**2025.06.26: PaddleOCR 3.0.3のリリース、以下の内容を含みます:**
- バグ修正:`enable_mkldnn`パラメータが機能しない問題を修正し、CPUがデフォルトでMKL-DNN推論を使用する動作を復元しました。
#### 🔥🔥**2025.06.19: PaddleOCR 3.0.2のリリース、以下の内容を含みます:**
- **新機能:**
- デフォルトのダウンロード元が`BOS`から`HuggingFace`に変更されました。ユーザーは環境変数 `PADDLE_PDX_MODEL_SOURCE` を `BOS` に変更することで、モデルのダウンロード元をBaidu Object Storage (BOS)に戻すこともできます。
- PP-OCRv5、PP-StructureV3、PP-ChatOCRv4などのパイプラインに、C++、Java、Go、C#、Node.js、PHPの6言語のサービス呼び出し例を追加しました。
- PP-StructureV3パイプラインのレイアウト分割ソートアルゴリズムを改善し、複雑な縦書きレイアウトのソートロジックを強化して、より良い結果を提供します。
- モデル選択ロジックを強化:言語が指定されているがモデルのバージョンが指定されていない場合、システムはその言語をサポートする最新のモデルバージョンを自動的に選択します。
- MKL-DNNキャッシュサイズにデフォルトの上限を設定し、無制限の増加を防ぎます。同時に、ユーザーがキャッシュ容量を設定することも可能です。
- 高性能推論のデフォルト設定を更新し、Paddle MKL-DNNアクセラレーションをサポートし、よりスマートな選択のための自動設定選択ロジックを最適化しました。
- インストールされているPaddleフレームワークによる計算デバイスの実際のサポートを考慮するようにデフォルトデバイスの取得ロジックを調整し、プログラムの動作をより直感的にしました。
- PP-OCRv5のAndroidサンプルを追加しました。[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html)。
- **バグ修正:**
- PP-StructureV3の一部のCLIパラメータが有効にならない問題を修正しました。
- 特定のケースで`export_paddlex_config_to_yaml`が正しく機能しない問題を解決しました。
- `save_path`の実際の動作とそのドキュメントの記述との間の不一致を修正しました。
- 基本的なサービス展開でMKL-DNNを使用する際の潜在的なマルチスレッドエラーを修正しました。
- Latex-OCRモデルの画像前処理におけるチャネル順序のエラーを修正しました。
- テキスト認識モジュールで可視化画像を保存する際のチャネル順序のエラーを修正しました。
- PP-StructureV3パイプラインで可視化されたテーブル結果のチャネル順序のエラーを解決しました。
- PP-StructureV3パイプラインで非常に特殊な状況下で`overlap_ratio`を計算する際のオーバーフロー問題を修正しました。
- **ドキュメントの改善:**
- ドキュメント内の`enable_mkldnn`パラメータの説明を更新し、プログラムの実際の動作を正確に反映するようにしました。
- `lang`および`ocr_version`パラメータに関するドキュメントのエラーを修正しました。
- CLIを介してプロダクションライン設定ファイルをエクスポートする手順を追加しました。
- PP-OCRv5のパフォーマンスデータテーブルで欠落していた列を修正しました。
- さまざまな構成におけるPP-StructureV3のベンチマーク指標を洗練しました。
- **その他:**
- numpyやpandasなどの依存関係のバージョン制限を緩和し、Python 3.12のサポートを復元しました。
<details>
<summary><strong>更新履歴</strong></summary>
#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1のリリース、以下の内容を含みます:**
- **一部のモデルとモデル設定の最適化:**
- PP-OCRv5のデフォルトモデル設定を更新し、検出と認識の両方をmobileモデルからserverモデルに変更しました。ほとんどのシーンでのデフォルト性能を向上させるため、設定の`limit_side_len`パラメータを736から64に変更しました。
- 新しいテキスト行方向分類モデル`PP-LCNet_x1_0_textline_ori`(精度99.42%)を追加しました。OCR、PP-StructureV3、およびPP-ChatOCRv4パイプラインのデフォルトのテキスト行方向分類器がこのモデルに更新されました。
- テキスト行方向分類モデル`PP-LCNet_x0_25_textline_ori`を最適化し、精度が3.3パーセントポイント向上し、現在の精度は98.85%です。
- **バージョン3.0.0の一部の問題の最適化と修正、[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/ja/update/update.html)**
🔥🔥2025.05.20: **PaddleOCR v3.0**の公式リリース、以下の内容を含みます:
- **PP-OCRv5**: あらゆるシーンに対応する高精度テキスト認識モデル - 画像/PDFから瞬時にテキストを抽出。
1. 🌐 単一モデルで**5つ**のテキストタイプをサポート - **簡体字中国語、繁体字中国語、簡体字中国語ピンイン、英語**、**日本語**をシームレスに処理。
2. ✍️ **手書き文字認識**の向上:複雑な草書体や非標準的な手書き文字の認識性能が大幅に向上。
3. 🎯 PP-OCRv4に比べて**13ポイントの精度向上**を達成し、さまざまな実世界のシナリオで最先端の性能を実現。
- **PP-StructureV3**: 汎用ドキュメント解析 - 実世界のシナリオで最先端の画像/PDF解析を解放!
1. 🧮 **高精度な複数シーンPDF解析**により、OmniDocBenchベンチマークでオープンソースおよびクローズドソースのソリューションをリード。
2. 🧠 **印鑑認識**、**グラフからテーブルへの変換**、**ネストされた数式/画像を含むテーブル認識**、**縦書きテキスト文書の解析**、**複雑なテーブル構造分析**などの専門機能。
- **PP-ChatOCRv4**: インテリジェントなドキュメント理解 - 画像/PDFからテキストだけでなく、キー情報を抽出。
1. 🔥 PDF/PNG/JPGファイルからのキー情報抽出において、前世代に比べて**15ポイントの精度向上**。
2. 💻 **ERNIE 4.5 Turbo**をネイティブサポートし、PaddleNLP、Ollama、vLLMなどを介した大規模モデルのデプロイメントとの互換性あり。
3. 🤝 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2) と統合し、印刷テキスト、手書き文字、印鑑、テーブル、グラフなど、複雑な文書内の一般的な要素の抽出と理解をサポート。
[更新履歴](https://paddlepaddle.github.io/PaddleOCR/latest/ja/update/update.html)
</details>
## ⚡ クイックスタート
### 1. オンラインデモの実行
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. インストール
[インストールガイド](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) を参照してPaddlePaddleをインストールした後、PaddleOCRツールキットをインストールします。
```bash
# paddleocrのインストール
pip install paddleocr
```
### 3. CLIによる推論の実行
```bash
# PP-OCRv5の推論を実行
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# PP-StructureV3の推論を実行
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# 最初にQianfan APIキーを取得し、その後PP-ChatOCRv4の推論を実行
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# "paddleocr ocr" の詳細情報を取得
paddleocr ocr --help
```
### 4. APIによる推論の実行
**4.1 PP-OCRv5の例**
```python
# PaddleOCRインスタンスの初期化
from paddleocr import PaddleOCR
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False)
# サンプル画像でOCR推論を実行
result = ocr.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
# 結果を可視化し、JSON形式で保存
for res in result:
res.print()
res.save_to_img("output")
res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3の例</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
# 画像の場合
output = pipeline.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)
# 結果を可視化し、JSON形式で保存
for res in output:
res.print()
res.save_to_json(save_path="output")
res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4の例</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc
chat_bot_config = {
"module_name": "chat_bot",
"model_name": "ernie-3.5-8k",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "openai",
"api_key": "api_key", # your api_key
}
retriever_config = {
"module_name": "retriever",
"model_name": "embedding-v1",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "qianfan",
"api_key": "api_key", # your api_key
}
pipeline = PPChatOCRv4Doc(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
visual_predict_res = pipeline.visual_predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
use_common_ocr=True,
use_seal_recognition=True,
use_table_recognition=True,
)
mllm_predict_info = None
use_mllm = False
# マルチモーダル大規模モデルを使用する場合、ローカルmllmサービスを起動する必要があります。ドキュメント: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md を参照してデプロイを行い、mllm_chat_bot_config設定を更新してください。
if use_mllm:
mllm_chat_bot_config = {
"module_name": "chat_bot",
"model_name": "PP-DocBee",
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
"api_type": "openai",
"api_key": "api_key", # your api_key
}
mllm_predict_res = pipeline.mllm_pred(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
key_list=["驾驶室准乘人数"],
mllm_chat_bot_config=mllm_chat_bot_config,
)
mllm_predict_info = mllm_predict_res["mllm_res"]
visual_info_list = []
for res in visual_predict_res:
visual_info_list.append(res["visual_info"])
layout_parsing_result = res["layout_parsing_result"]
vector_info = pipeline.build_vector(
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
key_list=["驾驶室准乘人数"],
visual_info=visual_info_list,
vector_info=vector_info,
mllm_predict_info=mllm_predict_info,
chat_bot_config=chat_bot_config,
retriever_config=retriever_config,
)
print(chat_result)
```
</details>
## ⛰️ 上級チュートリアル
- [PP-OCRv5 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 実行結果のクイックレビュー
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
</p>
</div>
## 👩‍👩‍👧‍👦 コミュニティ
| PaddlePaddle WeChat公式アカウント | 技術ディスカッショングループへの参加 |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 PaddleOCRを活用した素晴らしいプロジェクト
PaddleOCRは、その素晴らしいコミュニティなしでは今日の姿にはなりえませんでした!💗 長年のパートナー、新しい協力者、そしてPaddleOCRに情熱を注いでくださったすべての方々に心から感謝申し上げます。皆様のサポートが私たちの原動力です!
| プロジェクト名 | 概要 |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|詳細なドキュメント理解に基づくRAGエンジン。|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|複数タイプのドキュメントからMarkdownへの変換ツール|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|無料、オープンソースのバッチオフラインOCRソフトウェア。|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: 純粋なビジョンベースのGUIエージェントのための画面解析ツール。|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |あらゆるものに基づいた質疑応答。|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|複雑で多様なPDFドキュメントから高品質なコンテンツを効率的に抽出するために設計された強力なオープンソースツールキット。|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |画面上のテキストを認識し、翻訳して、リアルタイムで翻訳結果を表示します。|
| [他のプロジェクトを見る](./awesome_projects.md) | [PaddleOCRをベースにした他のプロジェクト](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 貢献者
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 ライセンス
このプロジェクトは[Apache 2.0 license](LICENSE)の下で公開されています。
## 🎓 引用
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1,340 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR 배너">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | 한국어 | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 소개
PaddleOCR은 출시 이후 최첨단 알고리즘(algorithm)과 실제 애플리케이션(application)에서의 입증된 성능 덕분에 학계, 산업계, 연구 커뮤니티에서 폭넓은 찬사를 받아왔습니다. Umi-OCR, OmniParser, MinerU, RAGFlow와 같은 유명 오픈소스 프로젝트에 이미 적용되어 전 세계 개발자(developer)들에게 필수 OCR 툴킷(toolkit)으로 자리 잡았습니다.
2025년 5월 20일, PaddlePaddle 팀은 **PaddlePaddle 3.0** 프레임워크의 공식 릴리스와 완전히 호환되는 PaddleOCR 3.0을 발표했습니다. 이 업데이트는 **텍스트 인식 정확도를 더욱 향상**시키고, **다중 텍스트 유형 인식** 및 **필기 인식**을 지원하며, 대규모 모델 애플리케이션의 **복잡한 문서의 고정밀 구문 분석**에 대한 증가하는 수요를 충족합니다. **ERNIE 4.5 Turbo**와 결합하면 주요 정보 추출 정확도가 크게 향상됩니다. 사용 설명서 전체는 [PaddleOCR 3.0 문서](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)를 참조하십시오.
PaddleOCR 3.0의 세 가지 주요 신규 기능:
- 범용 장면 텍스트 인식 모델(Universal-Scene Text Recognition Model) [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): 다섯 가지 다른 텍스트 유형과 복잡한 필기체를 처리하는 단일 모델입니다. 전체 인식 정확도는 이전 세대보다 13%p 향상되었습니다. [온라인 체험](https://aistudio.baidu.com/community/app/91660/webUI)
- 일반 문서 파싱(parsing) 솔루션 [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): 다중 레이아웃(multi-layout), 다중 장면 PDF의 고정밀 파싱(parsing)을 제공하며, 공개 벤치마크(benchmark)에서 많은 오픈 소스 및 클로즈드 소스 솔루션을 능가합니다. [온라인 체험](https://aistudio.baidu.com/community/app/518494/webUI)
- 지능형 문서 이해 솔루션 [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): ERNIE 4.5 Turbo에 의해 네이티브로 구동되며, 이전 모델보다 15%p 높은 정확도를 달성합니다. [온라인 체험](https://aistudio.baidu.com/community/app/518493/webUI)
PaddleOCR 3.0은 뛰어난 모델 라이브러리(model library)를 제공할 뿐만 아니라 모델 훈련, 추론 및 서비스 배포를 포괄하는 사용하기 쉬운 도구를 제공하여 개발자가 AI 애플리케이션을 신속하게 상용화할 수 있도록 지원합니다.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR 아키텍처">
</p>
</div>
## 📣 최신 업데이트
#### **2025.06.29: PaddleOCR 3.1.0 출시**, 주요 내용:
- **주요 모델 및 파이프라인:**
- **PP-OCRv5 다국어 텍스트 인식 모델 추가**, 프랑스어, 스페인어, 포르투갈어, 러시아어, 한국어 등 37개 언어의 텍스트 인식 모델 학습 및 추론 지원. **평균 정확도 30% 이상 향상.** [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
- PP-StructureV3의 **PP-Chart2Table 모델 업그레이드**, 차트에서 표로 변환하는 기능이 더욱 향상됨. 내부 커스텀 평가 세트에서 지표(RMS-F1)가 **9.36%p 상승(71.24% → 80.60%)**.
- PP-StructureV3 및 ERNIE 4.5 Turbo 기반 **문서 번역 파이프라인 PP-DocTranslation 신규 출시**, Markdown 형식 문서, 다양한 복잡 레이아웃의 PDF 문서, 문서 이미지를 번역 지원, 결과는 Markdown 형식으로 저장 가능. [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
- **새로운 MCP 서버:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
- **OCR 및 PP-StructureV3 파이프라인을 모두 지원합니다.**
- 로컬 Python 라이브러리, AIStudio 커뮤니티 클라우드 서비스, 자체 호스팅 서비스의 세 가지 작업 모드를 지원합니다.
- stdio를 통해 로컬 서비스를 호출하고, Streamable HTTP를 통해 원격 서비스를 호출할 수 있습니다.
- **문서 최적화:** 일부 사용자 가이드 설명 개선으로 읽기 경험 향상.
#### **2025.06.26: PaddleOCR 3.0.3 릴리스, 포함 내용:**
- 버그 수정: `enable_mkldnn` 매개변수가 작동하지 않는 문제를 해결하고, CPU가 기본적으로 MKL-DNN 추론을 사용하는 동작을 복원했습니다.
#### **🔥🔥 2025.06.19: PaddleOCR 3.0.2 릴리스, 포함 내용:**
- **새로운 기능:**
- 모델 기본 다운로드 소스가 `BOS`에서 `HuggingFace`로 변경되었습니다. 사용자는 환경 변수 `PADDLE_PDX_MODEL_SOURCE`를 `BOS`로 설정하여 모델 다운로드 소스를 Baidu Object Storage(BOS)로 되돌릴 수 있습니다(이 절 끝의 스케치 참고).
- PP-OCRv5, PP-StructureV3, PP-ChatOCRv4 파이프라인에 대해 C++, Java, Go, C#, Node.js, PHP 6개 언어의 서비스 호출 예제가 추가되었습니다.
- PP-StructureV3 파이프라인의 레이아웃 파티션 정렬 알고리즘을 개선하여 복잡한 세로 레이아웃의 정렬 논리를 향상했습니다.
- 언어(`lang`)만 지정하고 모델 버전을 명시하지 않은 경우, 해당 언어를 지원하는 최신 모델 버전을 자동으로 선택하도록 모델 선택 로직을 강화했습니다.
- MKL-DNN 캐시 크기에 기본 상한을 설정하여 무한 확장을 방지하고, 사용자 정의 캐시 용량 설정을 지원합니다.
- 고성능 추론의 기본 구성을 업데이트하여 Paddle MKL-DNN 가속을 지원하고, 자동 구성 선택 로직을 최적화했습니다.
- 설치된 Paddle 프레임워크가 지원하는 실제 디바이스를 고려하도록 기본 디바이스 선택 로직을 조정했습니다.
- PP-OCRv5의 Android 예제가 추가되었습니다. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
- **버그 수정:**
- PP-StructureV3 일부 CLI 파라미터가 적용되지 않던 문제를 수정했습니다.
- `export_paddlex_config_to_yaml`가 특정 상황에서 정상 동작하지 않던 문제를 해결했습니다.
- `save_path`의 실제 동작과 문서 설명이 일치하지 않던 문제를 수정했습니다.
- 기본 서비스화 배포에서 MKL-DNN을 사용할 때 발생할 수 있는 다중 스레딩 오류를 수정했습니다.
- Latex-OCR 모델의 이미지 전처리 과정에서 채널 순서 오류를 수정했습니다.
- 텍스트 인식 모듈에서 시각화 이미지를 저장할 때 발생하던 채널 순서 오류를 수정했습니다.
- PP-StructureV3 파이프라인의 표 시각화 결과에 발생하던 채널 순서 오류를 수정했습니다.
- PP-StructureV3 파이프라인에서 특수한 상황에서 `overlap_ratio` 계산 시 발생하던 오버플로 문제를 수정했습니다.
- **문서 개선:**
- 문서의 `enable_mkldnn` 파라미터 설명을 프로그램의 실제 동작에 맞게 업데이트했습니다.
- `lang` 및 `ocr_version` 파라미터에 대한 문서 오류를 수정했습니다.
- CLI를 통해 생산 라인 설정 파일을 내보내는 방법을 문서에 추가했습니다.
- PP-OCRv5 성능 데이터 표에서 누락된 열을 복원했습니다.
- 다양한 구성에서 PP-StructureV3의 벤치마크 지표를 개선했습니다.
- **기타:**
- numpy, pandas 등 의존성 버전 제한을 완화하여 Python 3.12 지원을 복원했습니다.
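아래는 위 3.0.2 변경 사항에서 설명한 `PADDLE_PDX_MODEL_SOURCE` 환경 변수 사용을 보여 주는 최소 스케치입니다(라이브러리를 import하기 전에 환경 변수를 설정해야 적용된다는 가정에 기반한 예시이며, 정확한 동작은 공식 문서를 따릅니다):
```python
# 가정에 기반한 최소 스케치: 모델 다운로드 소스를 BOS로 되돌리기
import os
# paddleocr를 import하기 전에 설정한다고 가정
os.environ["PADDLE_PDX_MODEL_SOURCE"] = "BOS"
from paddleocr import PaddleOCR
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
)
```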
<details>
<summary><strong>업데이트 기록</strong></summary>
#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1 릴리스, 포함 내용:**
- **일부 모델 및 모델 구성 최적화:**
- PP-OCRv5의 기본 모델 구성을 업데이트하여 탐지 및 인식을 모두 mobile에서 server 모델로 변경했습니다. 대부분의 시나리오에서 기본 성능을 향상시키기 위해 구성의 `limit_side_len` 파라미터(parameter)가 736에서 64로 변경되었습니다.
- 99.42%의 정확도를 가진 새로운 텍스트 라인 방향 분류 모델 `PP-LCNet_x1_0_textline_ori`를 추가했습니다. OCR, PP-StructureV3, PP-ChatOCRv4 파이프라인의 기본 텍스트 라인 방향 분류기가 이 모델로 업데이트되었습니다.
- 텍스트 라인 방향 분류 모델 `PP-LCNet_x0_25_textline_ori`를 최적화하여 정확도를 3.3%p 향상시켜 현재 정확도는 98.85%입니다.
- **버전 3.0.0의 일부 문제점에 대한 최적화 및 수정, [상세 정보](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: **PaddleOCR v3.0** 정식 출시, 포함 내용:
- **PP-OCRv5**: 모든 시나리오를 위한 고정밀 텍스트 인식 모델 - 이미지/PDF에서 즉시 텍스트 추출.
1. 🌐 단일 모델로 **다섯 가지** 텍스트 유형 지원 - **중국어 간체, 중국어 번체, 중국어 간체 병음, 영어, 일본어**를 단일 모델 내에서 원활하게 처리합니다.
2. ✍️ 향상된 **필기체 인식**: 복잡한 흘림체 및 비표준 필기체에서 성능이 크게 향상되었습니다.
3. 🎯 PP-OCRv4에 비해 **정확도 13%p 향상**, 다양한 실제 시나리오에서 SOTA(state-of-the-art) 성능을 달성했습니다.
- **PP-StructureV3**: 범용 문서 파싱(parsing) - 실제 시나리오를 위한 SOTA 이미지/PDF 파싱(parsing) 성능!
1. 🧮 **고정밀 다중 장면 PDF 파싱(parsing)**, OmniDocBench 벤치마크(benchmark)에서 오픈 소스 및 클로즈드 소스 솔루션을 모두 능가합니다.
2. 🧠 전문 기능에는 **도장 인식**, **차트-표 변환**, **중첩된 수식/이미지가 있는 표 인식**, **세로 텍스트 문서 파싱(parsing)**, **복잡한 표 구조 분석** 등이 포함됩니다.
- **PP-ChatOCRv4**: 지능형 문서 이해 - 이미지/PDF에서 단순한 텍스트가 아닌 핵심 정보 추출.
1. 🔥 이전 세대에 비해 PDF/PNG/JPG 파일의 핵심 정보 추출에서 **정확도 15%p 향상**.
2. 💻 **ERNIE 4.5 Turbo** 기본 지원, PaddleNLP, Ollama, vLLM 등을 통한 대규모 모델 배포와 호환됩니다.
3. 🤝 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2)와 통합되어 인쇄된 텍스트, 필기체, 도장, 표, 차트 등 복잡한 문서의 일반적인 요소 추출 및 이해를 지원합니다.
[히스토리 로그](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ 빠른 시작
### 1. 온라인 데모 실행
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. 설치
[설치 가이드](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)를 참조하여 PaddlePaddle을 설치한 후, PaddleOCR 툴킷을 설치하십시오.
```bash
# paddleocr 설치
pip install paddleocr
```
### 3. CLI를 통한 추론 실행
```bash
# PP-OCRv5 추론 실행
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# PP-StructureV3 추론 실행
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# 먼저 Qianfan API 키를 받고, PP-ChatOCRv4 추론 실행
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# "paddleocr ocr"에 대한 추가 정보 얻기
paddleocr ocr --help
```
### 4. API를 통한 추론 실행
**4.1 PP-OCRv5 예제**
```python
from paddleocr import PaddleOCR
# PaddleOCR 인스턴스 초기화
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False)
# 샘플 이미지에 대해 OCR 추론 실행
result = ocr.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
# 결과 시각화 및 JSON 결과 저장
for res in result:
res.print()
res.save_to_img("output")
res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3 예제</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
# 이미지용
output = pipeline.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)
# 결과 시각화 및 JSON 결과 저장
for res in output:
res.print()
res.save_to_json(save_path="output")
res.save_to_markdown(save_path="output")
```
</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 예제</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc
chat_bot_config = {
"module_name": "chat_bot",
"model_name": "ernie-3.5-8k",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "openai",
"api_key": "api_key", # your api_key
}
retriever_config = {
"module_name": "retriever",
"model_name": "embedding-v1",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "qianfan",
"api_key": "api_key", # your api_key
}
pipeline = PPChatOCRv4Doc(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
visual_predict_res = pipeline.visual_predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
use_common_ocr=True,
use_seal_recognition=True,
use_table_recognition=True,
)
mllm_predict_info = None
use_mllm = False
# 다중 모드 대형 모델을 사용하는 경우 로컬 mllm 서비스를 시작해야 합니다. 문서: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md를 참조하여 배포하고 mllm_chat_bot_config 구성을 업데이트할 수 있습니다.
if use_mllm:
mllm_chat_bot_config = {
"module_name": "chat_bot",
"model_name": "PP-DocBee",
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
"api_type": "openai",
"api_key": "api_key", # your api_key
}
mllm_predict_res = pipeline.mllm_pred(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
key_list=["驾驶室准乘人数"],
mllm_chat_bot_config=mllm_chat_bot_config,
)
mllm_predict_info = mllm_predict_res["mllm_res"]
visual_info_list = []
for res in visual_predict_res:
visual_info_list.append(res["visual_info"])
layout_parsing_result = res["layout_parsing_result"]
vector_info = pipeline.build_vector(
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
key_list=["驾驶室准乘人数"],
visual_info=visual_info_list,
vector_info=vector_info,
mllm_predict_info=mllm_predict_info,
chat_bot_config=chat_bot_config,
retriever_config=retriever_config,
)
print(chat_result)
```
</details>
## ⛰️ 고급 튜토리얼
- [PP-OCRv5 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 실행 결과 빠른 개요
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 데모">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 데모">
</p>
</div>
## 👩‍👩‍👧‍👦 커뮤니티
| PaddlePaddle 위챗(WeChat) 공식 계정 | 기술 토론 그룹 가입 |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 🏆 PaddleOCR을 활용하는 우수 프로젝트
PaddleOCR의 발전은 커뮤니티 없이는 불가능합니다! 💗 오랜 파트너, 새로운 협력자, 그리고 이름을 언급했든 안 했든 PaddleOCR에 열정을 쏟아부은 모든 분들께 진심으로 감사드립니다. 여러분의 지원이 우리의 원동력입니다!
| 프로젝트 이름 | 설명 |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|심층 문서 이해 기반의 RAG 엔진.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|다중 유형 문서를 마크다운(Markdown)으로 변환하는 도구|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|무료, 오픈 소스, 배치 오프라인 OCR 소프트웨어.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |순수 비전 기반 GUI 에이전트를 위한 화면 파싱(parsing) 도구.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |무엇이든 기반으로 한 질의응답 시스템.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|복잡하고 다양한 PDF 문서에서 고품질 콘텐츠를 효율적으로 추출하도록 설계된 강력한 오픈 소스 툴킷.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |화면의 텍스트를 인식하여 번역하고 번역 결과를 실시간으로 표시합니다.|
| [더 많은 프로젝트 보기](./awesome_projects.md) | [PaddleOCR 기반의 더 많은 프로젝트](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 기여자
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 라이선스
이 프로젝트는 [Apache 2.0 license](LICENSE)에 따라 배포됩니다.
## 🎓 인용
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1,348 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner.png" alt="Баннер PaddleOCR">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | Русский | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 Введение
С момента своего первого выпуска PaddleOCR получил широкое признание в академических, промышленных и исследовательских кругах благодаря своим передовым алгоритмам и доказанной производительности в реальных приложениях. Он уже используется в таких популярных проектах с открытым исходным кодом, как Umi-OCR, OmniParser, MinerU и RAGFlow, что делает его предпочтительным инструментарием OCR для разработчиков по всему миру.
20 мая 2025 года команда PaddlePaddle представила PaddleOCR 3.0, полностью совместимый с официальным выпуском фреймворка **PaddlePaddle 3.0**. Это обновление еще больше **повышает точность распознавания текста**, добавляет поддержку **распознавания нескольких типов текста** и **распознавания рукописного текста**, а также удовлетворяет растущий спрос на приложения с большими моделями для **высокоточного анализа сложных документов**. В сочетании с **ERNIE 4.5 Turbo** он значительно улучшает точность извлечения ключевой информации. PaddleOCR 3.0 также вводит поддержку китайских гетерогенных AI ускорителей, таких как **KUNLUNXIN** и **Ascend**. Для получения полной документации по использованию, пожалуйста, обратитесь к [Документации PaddleOCR 3.0](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
Три новые ключевые функции в PaddleOCR 3.0:
- Универсальная модель распознавания текста в любых сценах [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): Одна модель обрабатывает пять различных типов текста и сложный рукописный ввод. Общая точность распознавания увеличилась на 13 процентных пунктов по сравнению с предыдущим поколением. [Онлайн-демо](https://aistudio.baidu.com/community/app/91660/webUI)
- Общее решение для парсинга документов [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Обеспечивает высокоточный парсинг PDF-файлов с различными макетами и сценариями, превосходя многие решения с открытым и закрытым исходным кодом по результатам публичных тестов. [Онлайн-демо](https://aistudio.baidu.com/community/app/518494/webUI)
- Интеллектуальное решение для понимания документов [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Нативно поддерживается большой моделью ERNIE 4.5 Turbo, достигая на 15 процентных пунктов более высокой точности, чем его предшественник. [Онлайн-демо](https://aistudio.baidu.com/community/app/518493/webUI)
Помимо предоставления выдающейся библиотеки моделей, PaddleOCR 3.0 также предлагает удобные инструменты, охватывающие обучение моделей, инференс и развертывание сервисов, чтобы разработчики могли быстро внедрять ИИ-приложения в производство.
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch.png" alt="Архитектура PaddleOCR">
</p>
</div>
## 📣 Последние обновления
#### **2025.06.29: Выпуск PaddleOCR 3.1.0**, включает:
- **Основные модели и пайплайны:**
- **Добавлена многоязычная модель распознавания текста PP-OCRv5**, поддерживающая обучение и инференс для 37 языков, включая французский, испанский, португальский, русский, корейский и др. **Средняя точность увеличилась более чем на 30%.** [Подробнее](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
- Обновлена **модель PP-Chart2Table** в PP-StructureV3, что еще больше улучшило преобразование графиков в таблицы. На внутренних пользовательских тестах метрика (RMS-F1) **увеличилась на 9,36 процентных пункта (71,24% -> 80,60%).**
- Запущен новый **конвейер перевода документов PP-DocTranslation на основе PP-StructureV3 и ERNIE 4.5 Turbo**, поддерживающий перевод документов в формате Markdown, различных PDF-документов со сложной версткой и изображений документов, с сохранением результата в формате Markdown. [Подробнее](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
- **Новый сервер MCP:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
- **Поддерживает как OCR, так и конвейеры PP-StructureV3.**
- Поддерживаются три режима работы: локальная библиотека Python, облачный сервис сообщества AIStudio и самостоятельный хостинг.
- Поддерживается вызов локальных сервисов через stdio и удалённых сервисов через Streamable HTTP.
- **Оптимизация документации:** Улучшены описания в некоторых руководствах пользователя для более комфортного чтения.
#### **2025.06.26: Релиз PaddleOCR 3.0.3, включает:**
- Исправление ошибки: Исправлена проблема, из-за которой параметр `enable_mkldnn` не действовал, восстановлено поведение использования MKL-DNN для вывода на CPU по умолчанию.
#### 🔥🔥**2025.06.19: Релиз PaddleOCR 3.0.2, включает:**
- **Новые возможности:**
- Источник загрузки по умолчанию изменен с `BOS` на `HuggingFace`. Пользователи также могут изменить переменную окружения `PADDLE_PDX_MODEL_SOURCE` на `BOS`, чтобы установить источник загрузки моделей обратно на Baidu Object Storage (BOS).
- Добавлены примеры вызова сервисов для шести языков — C++, Java, Go, C#, Node.js и PHP — для пайплайнов, таких как PP-OCRv5, PP-StructureV3 и PP-ChatOCRv4.
- Улучшен алгоритм сортировки разделов макета в пайплайне PP-StructureV3, улучшена логика сортировки для сложных вертикальных макетов для достижения лучших результатов.
- Улучшена логика выбора модели: когда указан язык, но не указана версия модели, система автоматически выберет последнюю версию модели, поддерживающую этот язык.
- Установлен верхний предел по умолчанию для размера кэша MKL-DNN для предотвращения неограниченного роста, а также предоставлена пользователям возможность настраивать емкость кэша.
- Обновлены конфигурации по умолчанию для высокопроизводительного инференса для поддержки ускорения Paddle MKL-DNN и оптимизирована логика автоматического выбора конфигурации для более разумного выбора.
- Скорректирована логика получения устройства по умолчанию для учета фактической поддержки вычислительных устройств установленным фреймворком Paddle, что делает поведение программы более интуитивным.
- Добавлен пример для Android для PP-OCRv5. [Подробности](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
- **Исправления ошибок:**
- Исправлена проблема с некоторыми параметрами CLI в PP-StructureV3, которые не вступали в силу.
- Решена проблема, из-за которой `export_paddlex_config_to_yaml` в некоторых случаях работала некорректно.
- Исправлено несоответствие между фактическим поведением `save_path` и его описанием в документации.
- Исправлены потенциальные ошибки многопоточности при использовании MKL-DNN в базовом развертывании сервиса.
- Исправлены ошибки порядка каналов в предварительной обработке изображений для модели Latex-OCR.
- Исправлены ошибки порядка каналов при сохранении визуализированных изображений в модуле распознавания текста.
- Решены ошибки порядка каналов в визуализированных результатах таблиц в пайплайне PP-StructureV3.
- Исправлена проблема переполнения при вычислении `overlap_ratio` в крайне особых обстоятельствах в пайплайне PP-StructureV3.
- **Улучшения документации:**
- Обновлено описание параметра `enable_mkldnn` в документации, чтобы оно точно отражало фактическое поведение программы.
- Исправлены ошибки в документации, касающиеся параметров `lang` и `ocr_version`.
- Добавлены инструкции по экспорту файлов конфигурации производственной линии через CLI.
- Исправлены отсутствующие столбцы в таблице данных о производительности для PP-OCRv5.
- Уточнены метрики бенчмарков для PP-StructureV3 для различных конфигураций.
- **Прочее:**
- Ослаблены ограничения версий для зависимостей, таких как numpy и pandas, восстановлена поддержка Python 3.12.
<details>
<summary><strong>История обновлений</strong></summary>
#### **🔥🔥 2025.06.05: Релиз PaddleOCR 3.0.1, включает:**
- **Оптимизация некоторых моделей и их конфигураций:**
- Обновлена конфигурация модели по умолчанию для PP-OCRv5: модели обнаружения и распознавания изменены с `mobile` на `server`. Для улучшения производительности по умолчанию в большинстве сценариев параметр `limit_side_len` в конфигурации изменен с 736 на 64.
- Добавлена новая модель классификации ориентации строк текста `PP-LCNet_x1_0_textline_ori` с точностью 99.42%. Классификатор ориентации строк текста по умолчанию для пайплайнов OCR, PP-StructureV3 и PP-ChatOCRv4 обновлен до этой модели.
- Оптимизирована модель классификации ориентации строк текста `PP-LCNet_x0_25_textline_ori`, точность улучшена на 3.3 процентных пункта до текущего значения 98.85%.
- **Оптимизации и исправления некоторых проблем версии 3.0.0, [подробности](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
🔥🔥2025.05.20: Официальный релиз **PaddleOCR v3.0**, включающий:
- **PP-OCRv5**: Высокоточная модель распознавания текста для всех сценариев - Мгновенное извлечение текста из изображений/PDF.
1. 🌐 Поддержка **пяти** типов текста в одной модели - Бесшовная обработка **упрощенного китайского, традиционного китайского, пиньиня, английского** и **японского** в рамках одной модели.
2. ✍️ Улучшенное **распознавание рукописного текста**: Значительно лучше справляется со сложными слитными и нестандартными почерками.
3. 🎯 **Прирост точности на 13 процентных пунктов** по сравнению с PP-OCRv4, достижение самых современных результатов в различных реальных сценариях.
- **PP-StructureV3**: Универсальный парсинг документов - Используйте SOTA парсинг изображений/PDF для реальных сценариев!
1. 🧮 **Высокоточный парсинг PDF в различных сценариях**, опережающий как открытые, так и закрытые решения на бенчмарке OmniDocBench.
2. 🧠 Специализированные возможности включают **распознавание печатей**, **преобразование диаграмм в таблицы**, **распознавание таблиц с вложенными формулами/изображениями**, **парсинг документов с вертикальным текстом** и **анализ сложных структур таблиц**.
- **PP-ChatOCRv4**: Интеллектуальное понимание документов - Извлекайте ключевую информацию, а не просто текст из изображений/PDF.
1. 🔥 **Прирост точности на 15 процентных пунктов** в извлечении ключевой информации из файлов PDF/PNG/JPG по сравнению с предыдущим поколением.
2. 💻 Нативная поддержка **ERNIE 4.5 Turbo**, с совместимостью для развертывания больших моделей через PaddleNLP, Ollama, vLLM и другие.
3. 🤝 Интегрирован [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), обеспечивающий извлечение и понимание печатного текста, рукописного текста, печатей, таблиц, диаграмм и других общих элементов в сложных документах.
[История обновлений](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
</details>
## ⚡ Быстрый старт
### 1. Запустить онлайн-демо
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. Установка
Установите PaddlePaddle, следуя [Руководству по установке](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), после чего установите инструментарий PaddleOCR.
```bash
# Установить paddleocr
pip install paddleocr
```
### 3. Запуск инференса через CLI
```bash
# Запустить инференс PP-OCRv5
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# Запустить инференс PP-StructureV3
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# Сначала получите Qianfan API Key, а затем запустите инференс PP-ChatOCRv4
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# Получить больше информации о "paddleocr ocr"
paddleocr ocr --help
```
### 4. Запуск инференса через API
**4.1 Пример для PP-OCRv5**
```python
# Инициализация экземпляра PaddleOCR
from paddleocr import PaddleOCR
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False)
# Запуск инференса OCR на примере изображения
result = ocr.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
# Визуализация результатов и сохранение в формате JSON
for res in result:
res.print()
res.save_to_img("output")
res.save_to_json("output")
```
<details>
<summary><strong>4.2 Пример для PP-StructureV3</strong></summary>
```python
from pathlib import Path
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
# Для изображений
output = pipeline.predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)
# Визуализация результатов и сохранение в формате JSON
for res in output:
res.print()
res.save_to_json(save_path="output")
res.save_to_markdown(save_path="output")
```
</details>
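Ниже приведён минимальный набросок для PDF-файлов: во введении заявлен высокоточный парсинг многостраничных PDF, поэтому предполагается, что тому же методу `predict` можно передать путь к локальному PDF, а каждая страница даст отдельный результат (путь к файлу гипотетический; точное поведение см. в официальной документации):
```python
# Набросок на основе предположения: парсинг локального PDF тем же пайплайном
from paddleocr import PPStructureV3
pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
)
# Предполагается, что predict принимает путь к PDF; каждая страница -> отдельный res
output = pipeline.predict(input="path/to/document.pdf")  # гипотетический путь
for res in output:
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")  # Markdown для каждой страницы
```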
<details>
<summary><strong>4.3 Пример для PP-ChatOCRv4</strong></summary>
```python
from paddleocr import PPChatOCRv4Doc
chat_bot_config = {
"module_name": "chat_bot",
"model_name": "ernie-3.5-8k",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "openai",
"api_key": "api_key", # ваш api_key
}
retriever_config = {
"module_name": "retriever",
"model_name": "embedding-v1",
"base_url": "https://qianfan.baidubce.com/v2",
"api_type": "qianfan",
"api_key": "api_key", # ваш api_key
}
pipeline = PPChatOCRv4Doc(
use_doc_orientation_classify=False,
use_doc_unwarping=False
)
visual_predict_res = pipeline.visual_predict(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
use_common_ocr=True,
use_seal_recognition=True,
use_table_recognition=True,
)
mllm_predict_info = None
use_mllm = False
# Если используется мультимодальная большая модель, необходимо запустить локальный сервис mllm. Вы можете обратиться к документации: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md для выполнения развертывания и обновления конфигурации mllm_chat_bot_config.
if use_mllm:
mllm_chat_bot_config = {
"module_name": "chat_bot",
"model_name": "PP-DocBee",
"base_url": "http://127.0.0.1:8080/", # URL вашего локального сервиса mllm
"api_type": "openai",
"api_key": "api_key", # ваш api_key
}
mllm_predict_res = pipeline.mllm_pred(
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
key_list=["驾驶室准乘人数"],
mllm_chat_bot_config=mllm_chat_bot_config,
)
mllm_predict_info = mllm_predict_res["mllm_res"]
visual_info_list = []
for res in visual_predict_res:
visual_info_list.append(res["visual_info"])
layout_parsing_result = res["layout_parsing_result"]
vector_info = pipeline.build_vector(
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
key_list=["驾驶室准乘人数"],
visual_info=visual_info_list,
vector_info=vector_info,
mllm_predict_info=mllm_predict_info,
chat_bot_config=chat_bot_config,
retriever_config=retriever_config,
)
print(chat_result)
```
</details>
### 5. Китайские гетерогенные ИИ-ускорители
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
## ⛰️ Продвинутые руководства
- [Руководство по PP-OCRv5](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [Руководство по PP-StructureV3](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [Руководство по PP-ChatOCRv4](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Краткий обзор результатов выполнения
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="Демо PP-OCRv5">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="Демо PP-StructureV3">
</p>
</div>
## 👩‍👩‍👧‍👦 Сообщество
| Официальный аккаунт PaddlePaddle в WeChat | Присоединяйтесь к группе для технических обсуждений |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 😃 Потрясающие проекты, использующие PaddleOCR
PaddleOCR не был бы там, где он есть сегодня, без своего невероятного сообщества! 💗 Огромное спасибо всем нашим давним партнерам, новым сотрудникам и всем, кто вложил свою страсть в PaddleOCR — независимо от того, назвали мы вас или нет. Ваша поддержка разжигает наш огонь!
| Название проекта | Описание |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG-движок, основанный на глубоком понимании документов.|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Инструмент для преобразования документов различных типов в Markdown|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Бесплатное офлайн-программное обеспечение для пакетного OCR с открытым исходным кодом.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |Инструмент парсинга экрана для GUI-агента, основанного исключительно на компьютерном зрении.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Система вопросов и ответов на основе любого контента.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Мощный инструментарий с открытым исходным кодом, предназначенный для эффективного извлечения высококачественного контента из сложных и разнообразных PDF-документов.|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Распознает текст на экране, переводит его и отображает результаты перевода в режиме реального времени.|
| [Узнать больше о проектах](./awesome_projects.md) | [Больше проектов на основе PaddleOCR](./awesome_projects.md)|
## 👩‍👩‍👧‍👦 Контрибьюторы
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 Лицензия
Этот проект выпущен под [лицензией Apache 2.0](LICENSE).
## 🎓 Цитирование
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1,345 @@
<div align="center">
<p>
<img width="100%" src="./docs/images/Banner_cn.png" alt="PaddleOCR 橫幅">
</p>
<!-- language -->
[English](./README.md) | [简体中文](./README_cn.md) | 繁體中文 | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
<!-- icon -->
[![stars](https://img.shields.io/github/stars/PaddlePaddle/PaddleOCR?color=ccf)](https://github.com/PaddlePaddle/PaddleOCR)
[![Downloads](https://img.shields.io/pypi/dm/paddleocr)](https://pypi.org/project/PaddleOCR/)
![python](https://img.shields.io/badge/python-3.8~3.12-aff.svg)
![os](https://img.shields.io/badge/os-linux%2C%20win%2C%20mac-pink.svg)
![hardware](https://img.shields.io/badge/hardware-cpu%2C%20gpu%2C%20xpu%2C%20npu-yellow.svg)
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
</div>
## 🚀 簡介
PaddleOCR 自發布以來,憑藉其學術前沿的演算法與產業落地實踐,深受產學研各界的喜愛,並廣泛應用於眾多知名開源專案,如 Umi-OCR、OmniParser、MinerU、RAGFlow 等,已成為廣大開發者心中開源 OCR 領域的首選工具。
2025 年 5 月 20 日,飛槳團隊發布 **PaddleOCR 3.0**,全面適配**飛槳框架 3.0 正式版**,進一步**提升文字辨識精度**,支援**多種文字類型辨識**和**手寫體辨識**,滿足大型模型應用對**複雜文件高精度解析**的旺盛需求。結合**ERNIE 4.5 Turbo**,顯著提升了關鍵資訊擷取的精度,並新增**對崑崙芯、昇騰等國產硬體**的支援。完整使用說明請參閱 [PaddleOCR 3.0 文檔](https://paddlepaddle.github.io/PaddleOCR/latest/)。
PaddleOCR 3.0 **新增**三大特色功能:
- 全場景文字辨識模型 [PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):單一模型支援五種文字類型和複雜手寫體辨識;整體辨識精度相較前一代**提升 13 個百分點**。[線上體驗](https://aistudio.baidu.com/community/app/91660/webUI)
- 通用文件解析方案 [PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支援多場景、多版式的 PDF 高精度解析,在公開評測集中**領先眾多開源與閉源方案**。[線上體驗](https://aistudio.baidu.com/community/app/518494/webUI)
- 智慧文件理解方案 [PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支援 ERNIE 4.5 Turbo,精度相較前一代**提升 15 個百分點**。[線上體驗](https://aistudio.baidu.com/community/app/518493/webUI)
除了提供優秀的模型庫PaddleOCR 3.0 還提供好學易用的工具,涵蓋模型訓練、推論及服務化部署,方便開發者快速將 AI 應用落地。
<div align="center">
<p>
<img width="100%" src="./docs/images/Arch_cn.png" alt="PaddleOCR 架構">
</p>
</div>
## 📣 最新動態
**🔥🔥2025.06.29:發布 PaddleOCR 3.1.0**,內容包括:
- **主要模型與流程:**
- **新增 PP-OCRv5 多語言文字識別模型**,支援包括法語、西班牙語、葡萄牙語、俄語、韓語等在內的 37 種語言的文字識別模型訓練與推理。**平均準確率提升超過 30%。** [詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
- 升級了 PP-StructureV3 的 **PP-Chart2Table 模型**,進一步提升圖表轉表格能力。在內部自訂評測集上,指標(RMS-F1)**提升了 9.36 個百分點(71.24% -> 80.60%)。**
- 新增基於 PP-StructureV3 和 ERNIE 4.5 Turbo 的**文件翻譯流程 PP-DocTranslation**,支援 Markdown 格式文件、各種複雜版面 PDF 文件及文件圖片翻譯,結果可儲存為 Markdown 格式文件。[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-DocTranslation.html)
- **新增 MCP server**:[Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/mcp_server.html)
- **支援 OCR 及 PP-StructureV3 流程。**
- 支援三種工作模式:本地 Python 函式庫、AIStudio 社群雲端服務、自主託管服務。
- 支援通過 stdio 調用本地服務,通過 Streamable HTTP 調用遠端服務。
- **文件優化:** 優化了部分使用說明文件描述,提升閱讀體驗。
2025.06.26: **PaddleOCR 3.0.3** 發布,包含:
- 錯誤修復:修復 `enable_mkldnn` 參數不生效的問題,恢復 CPU 預設使用 MKL-DNN 推理的行為。
2025.06.19: **PaddleOCR 3.0.2** 發布,包含:
- **功能新增:**
- 模型預設下載來源從`BOS`改為`HuggingFace`,同時也支援使用者透過更改環境變數`PADDLE_PDX_MODEL_SOURCE`為`BOS`,將模型下載來源設定為百度雲端物件儲存 BOS。
- PP-OCRv5、PP-StructureV3、PP-ChatOCRv4 等 pipeline 新增 C++、Java、Go、C#、Node.js、PHP 6 種語言的服務呼叫範例。
- 優化 PP-StructureV3 產線中版面分區排序演算法,對複雜直書版面排序邏輯進行完善,進一步提升了複雜版面排序效果。
- 優化模型選擇邏輯,當指定語言、未指定模型版本時,自動選擇支援該語言的最新版本的模型(參考本節末尾的示例)。
- 為 MKL-DNN 快取大小設定預設上限,防止快取無限增長。同時,支援使用者設定快取容量。
- 更新高效能推論預設設定,支援 Paddle MKL-DNN 加速。優化高效能推論自動設定邏輯,支援更智慧的設定選擇。
- 調整預設裝置取得邏輯,考量環境中安裝的 Paddle 框架對運算裝置的實際支援情況,使程式行為更符合直覺。
- 新增 PP-OCRv5 的 Android 端範例,[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/on_device_deployment.html)。
- **錯誤修復:**
- 修復 PP-StructureV3 部分 CLI 參數不生效的問題。
- 修復部分情況下 `export_paddlex_config_to_yaml` 無法正常運作的問題。
- 修復 `save_path` 實際行為與文件描述不符的問題。
- 修復基礎服務化部署在使用 MKL-DNN 時可能出現的多執行緒錯誤。
- 修復 Latex-OCR 模型的影像預處理通道順序錯誤。
- 修復文字辨識模組儲存視覺化影像的通道順序錯誤。
- 修復 PP-StructureV3 中表格視覺化結果通道順序錯誤。
- 修復 PP-StructureV3 產線中極特殊情況下,計算 `overlap_ratio` 時,變數溢位問題。
- **文件優化:**
- 更新文件中對 `enable_mkldnn` 參數的說明,使其更準確地描述程式的實際行為。
- 修復文件中對 `lang` 和 `ocr_version` 參數描述的錯誤。
- 補充透過 CLI 匯出產線設定檔案的說明。
- 修復 PP-OCRv5 效能資料表格中的欄位缺失問題。
- 潤飾 PP-StructureV3 在不同設定下的 benchmark 指標。
- **其他:**
- 放寬 numpy、pandas 等依賴項的版本限制,恢復對 Python 3.12 的支援。
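以下為補充上述模型選擇邏輯的最小示例(假設性示意,非官方完整文件:`lang` 與 `ocr_version` 參數名稱取自本節文字,語言代碼與版本值為示例假設):
```python
# 假設性最小示例:透過 lang 與 ocr_version 控制模型選擇
from paddleocr import PaddleOCR
# 僅指定語言時,假設會自動選擇支援該語言的最新版本模型(見上文 3.0.2 說明)
ocr_ko = PaddleOCR(lang="korean")  # 語言代碼為示例假設
# 也可同時指定模型版本(版本值為示例假設,實際以官方文件為準)
ocr_en = PaddleOCR(lang="en", ocr_version="PP-OCRv4")
```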
<details>
<summary><strong>歷史日誌</strong></summary>
🔥🔥2025.06.05: **PaddleOCR 3.0.1** 發布,包含:
- **優化部分模型和模型設定:**
- 更新 PP-OCRv5 預設模型設定,偵測和辨識模型均由 mobile 改為 server 模型。為改善多數場景下的預設效果,設定中的參數 `limit_side_len` 由 736 改為 64。
- 新增文字行方向分類模型 `PP-LCNet_x1_0_textline_ori`,精度達 99.42%。OCR、PP-StructureV3、PP-ChatOCRv4 流程的預設文字行方向分類器已更新為此模型。
- 優化文字行方向分類模型 `PP-LCNet_x0_25_textline_ori`,精度提升 3.3 個百分點,目前精度為 98.85%。
- **優化及修復 3.0.0 版本的部分問題,[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)**
🔥🔥2025.05.20: **PaddleOCR 3.0** 正式發布,包含:
- **PP-OCRv5**: 全場景高精度文字辨識
1. 🌐 單一模型支援**五種**文字類型(**簡體中文**、**繁體中文**、**中文拼音**、**英文**和**日文**)。
2. ✍️ 支援複雜**手寫體**辨識:顯著提升對複雜連筆、非標準字跡的辨識效能。
3. 🎯 整體辨識精度提升:在多種應用場景達到 SOTA 精度,相較於上一版 PP-OCRv4,辨識精度**提升 13 個百分點**。
- **PP-StructureV3**: 通用文件解析方案
1. 🧮 支援多場景 PDF 高精度解析,在 OmniDocBench 基準測試中**領先眾多開源與閉源方案**。
2. 🧠 多項專業功能:**印章辨識**、**圖表轉表格**、**含嵌套公式/圖片的表格辨識**、**直書文字解析**及**複雜表格結構分析**等。
- **PP-ChatOCRv4**: 智慧文件理解方案
1. 🔥 文件影像(PDF/PNG/JPG)關鍵資訊擷取精度相較前一代**提升 15 個百分點**。
2. 💻 原生支援**ERNIE 4.5 Turbo**,並相容 PaddleNLP、Ollama、vLLM 等工具部署的大型模型。
3. 🤝 整合 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2),支援印刷體、手寫體、印章、表格、圖表等複雜文件元素的資訊擷取與理解。
[更多日誌](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)
</details>
## ⚡ 快速入門
### 1. 線上體驗
[![AI Studio](https://img.shields.io/badge/PP_OCRv5-AI_Studio-green)](https://aistudio.baidu.com/community/app/91660/webUI)
[![AI Studio](https://img.shields.io/badge/PP_StructureV3-AI_Studio-green)](https://aistudio.baidu.com/community/app/518494/webUI)
[![AI Studio](https://img.shields.io/badge/PP_ChatOCRv4-AI_Studio-green)](https://aistudio.baidu.com/community/app/518493/webUI)
### 2. 本機安裝
請參考[安裝指南](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)完成 **PaddlePaddle 3.0** 的安裝,然後安裝 paddleocr。
```bash
# 安裝 paddleocr
pip install paddleocr
```
### 3. 命令列推論
```bash
# 執行 PP-OCRv5 推論
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
# 執行 PP-StructureV3 推論
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
# 執行 PP-ChatOCRv4 推論前,需先取得千帆 API Key
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
# 查看 "paddleocr ocr" 詳細參數
paddleocr ocr --help
```
### 4. API Inference
**4.1 PP-OCRv5 Example**
```python
from paddleocr import PaddleOCR
# initialize a PaddleOCR instance
ocr = PaddleOCR(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False)
# run OCR inference on a sample image
result = ocr.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
# visualize the results and save them as JSON
for res in result:
    res.print()
    res.save_to_img("output")
    res.save_to_json("output")
```
<details>
<summary><strong>4.2 PP-StructureV3 Example</strong></summary>

```python
from pathlib import Path
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

# for an image input
output = pipeline.predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
)

# visualize the results and save them as JSON
for res in output:
    res.print()
    res.save_to_json(save_path="output")
    res.save_to_markdown(save_path="output")
```

</details>
<details>
<summary><strong>4.3 PP-ChatOCRv4 Example</strong></summary>

```python
from paddleocr import PPChatOCRv4Doc

chat_bot_config = {
    "module_name": "chat_bot",
    "model_name": "ernie-3.5-8k",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "openai",
    "api_key": "api_key",  # your api_key
}

retriever_config = {
    "module_name": "retriever",
    "model_name": "embedding-v1",
    "base_url": "https://qianfan.baidubce.com/v2",
    "api_type": "qianfan",
    "api_key": "api_key",  # your api_key
}

pipeline = PPChatOCRv4Doc(
    use_doc_orientation_classify=False,
    use_doc_unwarping=False
)

visual_predict_res = pipeline.visual_predict(
    input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
    use_common_ocr=True,
    use_seal_recognition=True,
    use_table_recognition=True,
)

mllm_predict_info = None
use_mllm = False
# To use a multimodal large model, start a local mllm service first. For deployment, refer to the docs at
# https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md
# and update the mllm_chat_bot_config accordingly.
if use_mllm:
    mllm_chat_bot_config = {
        "module_name": "chat_bot",
        "model_name": "PP-DocBee",
        "base_url": "http://127.0.0.1:8080/",  # your local mllm service url
        "api_type": "openai",
        "api_key": "api_key",  # your api_key
    }

    mllm_predict_res = pipeline.mllm_pred(
        input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
        key_list=["驾驶室准乘人数"],
        mllm_chat_bot_config=mllm_chat_bot_config,
    )
    mllm_predict_info = mllm_predict_res["mllm_res"]

visual_info_list = []
for res in visual_predict_res:
    visual_info_list.append(res["visual_info"])
    layout_parsing_result = res["layout_parsing_result"]

vector_info = pipeline.build_vector(
    visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
)
chat_result = pipeline.chat(
    key_list=["驾驶室准乘人数"],
    visual_info=visual_info_list,
    vector_info=vector_info,
    mllm_predict_info=mllm_predict_info,
    chat_bot_config=chat_bot_config,
    retriever_config=retriever_config,
)
print(chat_result)
```

</details>
### 5. **Domestic Hardware Support**
- [Kunlunxin XPU installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
- [Ascend NPU installation guide](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
## ⛰️ Advanced Guides
- [PP-OCRv5 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
- [PP-StructureV3 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
- [PP-ChatOCRv4 tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
## 🔄 Demos
<div align="center">
<p>
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
</p>
</div>
<div align="center">
<p>
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
</p>
</div>
## 👩‍👩‍👧‍👦 Developer Community
| Scan the QR code to follow the PaddlePaddle official account | Scan the QR code to join the technical discussion group |
| :---: | :---: |
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
## 🏆 Awesome Projects Using PaddleOCR
PaddleOCR would not be where it is today without its community! 💗 A heartfelt thank-you to all developers, partners, and contributors!
| Project Name | Description |
| ------------ | ----------- |
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG-based AI workflow engine|
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Tool for converting multiple document types to Markdown|
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, open-source batch offline OCR software|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |Screen parsing tool for pure-vision-based GUI agents|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question-and-answer system based on any content|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Efficient toolkit for extracting content from complex PDF documents|
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Real-time on-screen translation tool|
| [More projects](./awesome_projects.md) | |
## 👩‍👩‍👧‍👦 Contributors
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
</a>
## 🌟 Star
[![Star History Chart](https://api.star-history.com/svg?repos=PaddlePaddle/PaddleOCR&type=Date)](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
## 📄 License
This project is released under the [Apache 2.0 license](LICENSE).
## 🎓 Citation
```
@misc{paddleocr2020,
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
year={2020}
}
```

View File

@ -0,0 +1 @@
Please see the [docs](https://paddlepaddle.github.io/PaddleOCR/latest/applications/overview.html).

View File

@ -0,0 +1,29 @@
## 😃 Awesome projects based on PaddleOCR
💗 PaddleOCR wouldn't be where it is today without its incredible community! A massive 🙌 thank you 🙌 to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR — whether we've named you or not. Your support fuels our fire! 🔥
| Project Name | Description |
| ------------ | ----------- |
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
| [LearnOpenCV](http://github.com/spmallick/learnopencv) <a href="http://github.com/spmallick/learnopencv"><img src="https://img.shields.io/github/stars/spmallick/learnopencv"></a> | code for Computer Vision, Deep learning, and AI research articles.|
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
| [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)<a href="https://github.com/PaddlePaddle/PaddleHub"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleHub"></a> |400+ AI Models: Rich, high-quality AI models, including CV, NLP, Speech, Video and Cross-Modal.|
| [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP)<a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleNLP"></a> |A Large Language Model (LLM) development suite based on PaddlePaddle.|
| [Rerun](https://github.com/rerun-io/rerun) <a href="https://github.com/rerun-io/rerun"><img src="https://img.shields.io/github/stars/rerun-io/rerun"></a> | Rerun is building the multimodal data stack to model, ingest, store, query and view robotics-style data |
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator) <a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> | Recognize text on the screen, translate it and show the translation results in real time.|
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a> | PDF-Extract-Kit is a powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents. |
| [manga-image-translator](https://github.com/zyddnys/manga-image-translator) <a href="https://github.com/zyddnys/manga-image-translator"><img src="https://img.shields.io/github/stars/zyddnys/manga-image-translator"></a> | Translate texts in manga/images.|
| [March7thAssistant](https://github.com/moesnow/March7thAssistant) <a href="https://github.com/moesnow/March7thAssistant"><img src="https://img.shields.io/github/stars/moesnow/March7thAssistant"></a> | Daily Tasks: Stamina recovery, daily training, claiming rewards, commissions, and farming. |
| [PaddlePaddle/models](https://github.com/PaddlePaddle/models) <a href="https://github.com/PaddlePaddle/models"><img src="https://img.shields.io/github/stars/PaddlePaddle/models"></a> |PaddlePaddle's industrial-grade model zoo.|
| [katanaml/sparrow](https://github.com/katanaml/sparrow) <a href="https://github.com/katanaml/sparrow"><img src="https://img.shields.io/github/stars/katanaml/sparrow"></a> | Sparrow is an innovative open-source solution for efficient data extraction and processing from various documents and images. |
| [RapidOCR](https://github.com/RapidAI/RapidOCR) <a href="https://github.com/RapidAI/RapidOCR"><img src="https://img.shields.io/github/stars/RapidAI/RapidOCR"></a> | Awesome OCR multiple programing languages toolkits based on ONNXRuntime, OpenVINO, PaddlePaddle and PyTorch |
| [autoMate](https://github.com/yuruotong1/autoMate) <a href="https://github.com/yuruotong1/autoMate"><img src="https://img.shields.io/github/stars/yuruotong1/autoMate"></a> | AI-Powered Local Automation Tool & Let Your Computer Work for You. |
| [Agent-S](https://github.com/simular-ai/Agent-S) <a href="https://github.com/simular-ai/Agent-S"><img src="https://img.shields.io/github/stars/simular-ai/Agent-S"></a> | A Compositional Generalist-Specialist Framework for Computer Use Agents. |
| [pdf-craft](https://github.com/oomol-lab/pdf-craft) <a href="https://github.com/oomol-lab/pdf-craft"><img src="https://img.shields.io/github/stars/oomol-lab/pdf-craft"></a> | PDF Craft can convert PDF files into various other formats. |
| [VV](https://github.com/Cicada000/VV) <a href="https://github.com/Cicada000/VV"><img src="https://img.shields.io/github/stars/Cicada000/VV"></a> | Zhang Weiwei Quotations Search Project. |
| [docetl](https://github.com/ucbepic/docetl) <a href="https://github.com/ucbepic/docetl"><img src="https://img.shields.io/github/stars/ucbepic/docetl"></a> | DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. |
| [ZenlessZoneZero-Auto](https://github.com/sMythicalBird/ZenlessZoneZero-Auto) <a href="https://github.com/sMythicalBird/ZenlessZoneZero-Auto"><img src="https://img.shields.io/github/stars/sMythicalBird/ZenlessZoneZero-Auto"></a> | Zenless Zone Zero Automation Framework. |
| [Yuxi-Know](https://github.com/xerrors/Yuxi-Know) <a href="https://github.com/xerrors/Yuxi-Know"><img src="https://img.shields.io/github/stars/xerrors/Yuxi-Know"></a> | Knowledge graph question answering system based on LLMs. |
| [PaddleSharp](https://github.com/sdcb/PaddleSharp) <a href="https://github.com/sdcb/PaddleSharp"><img src="https://img.shields.io/github/stars/sdcb/PaddleSharp"></a>|.NET/C# binding for Baidu paddle inference library and PaddleOCR |
| [python-office](https://github.com/CoderWanFeng/python-office) <a href="https://github.com/CoderWanFeng/python-office"><img src="https://img.shields.io/github/stars/CoderWanFeng/python-office"></a> | Python tool for office works. |
| [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR) <a href="https://github.com/jingsongliujing/OnnxOCR"><img src="https://img.shields.io/github/stars/jingsongliujing/OnnxOCR"></a>|A lightweight OCR system based on PaddleOCR, decoupled from the PaddlePaddle deep learning training framework, with ultra-fast inference speed |
| ... |... |

View File

@ -0,0 +1,2 @@
*.html linguist-language=python
*.ipynb linguist-language=python

View File

@ -0,0 +1,16 @@
.DS_Store
*.pth
*.pyc
*.pyo
*.log
*.tmp
*.pkl
__pycache__/
.idea/
output/
test/*.jpg
datasets/
index/
train_log/
log/
profiling_log/

View File

@ -0,0 +1,201 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -0,0 +1,132 @@
# Real-time Scene Text Detection with Differentiable Binarization
**Note**: some code is inherited from [WenmuZhou/DBNet.pytorch](https://github.com/WenmuZhou/DBNet.pytorch)
[Chinese write-up](https://zhuanlan.zhihu.com/p/94677957)
![network](imgs/paper/db.jpg)
## Update
2020-06-07: Added grayscale-image training; when training on grayscale images, remove `dataset.args.transforms.Normalize` from the config.
## Install Using Conda
```
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
conda env create -f environment.yml
```
or
## Install Manually
```bash
conda create -n dbnet python=3.6
conda activate dbnet
conda install ipython pip
# python dependencies
pip install -r requirement.txt
# clone repo
git clone https://github.com/WenmuZhou/DBNet.paddle.git
cd DBNet.paddle/
```
## Requirements
* paddlepaddle 2.4+
## Download
TBD
## Data Preparation
Training data: prepare a text file `train.txt` in the following format, using '\t' as the separator:
```
./datasets/train/img/001.jpg ./datasets/train/gt/001.txt
```
Validation data: prepare a text file `test.txt` in the following format, using '\t' as the separator:
```
./datasets/test/img/001.jpg ./datasets/test/gt/001.txt
```
- Store images in the `img` folder
- Store groundtruth in the `gt` folder
The groundtruth can be `.txt` files, with the following format:
```
x1, y1, x2, y2, x3, y3, x4, y4, annotation
```
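If your data follows the `img`/`gt` layout above, the list files can be generated with a small script. The following is an illustrative sketch, not part of this repo; the `build_list` helper and the directory names are assumptions matching the example paths.
```python
# Pair each image with its ground-truth file and write one
# "img_path\tgt_path" line per sample.
import os

def build_list(pair_root: str, out_file: str) -> None:
    img_dir = os.path.join(pair_root, "img")
    gt_dir = os.path.join(pair_root, "gt")
    with open(out_file, "w", encoding="utf-8") as f:
        for name in sorted(os.listdir(img_dir)):
            stem, _ = os.path.splitext(name)
            gt_path = os.path.join(gt_dir, stem + ".txt")
            if os.path.exists(gt_path):  # keep only images that have a ground-truth file
                f.write(f"{os.path.join(img_dir, name)}\t{gt_path}\n")

build_list("./datasets/train", "train.txt")
build_list("./datasets/test", "test.txt")
```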
## Train
1. Configure `dataset['train']['dataset']['args']['data_path']` and `dataset['validate']['dataset']['args']['data_path']` in [config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml](config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml)
* Single-GPU training
```bash
bash single_gpu_train.sh
```
* Multi-GPU training
```bash
bash multi_gpu_train.sh
```
## Test
[eval.py](tools/eval.py) is used to evaluate the model on the test dataset.
1. Configure `model_path` in [eval.sh](eval.sh)
2. Use the following script to run evaluation:
```bash
bash eval.sh
```
## Predict
[predict.py](tools/predict.py) can be used to run inference on all images in a folder.
1. Configure `model_path`, `input_folder`, and `output_folder` in [predict.sh](predict.sh)
2. Use the following script to predict:
```
bash predict.sh
```
You can change `model_path` in the `predict.sh` file to point to your model location.
Tip: if the results are not good, you can adjust `thre` in [predict.sh](predict.sh).
## Export Model
[export_model.py](tools/export_model.py) can be used to export a trained checkpoint as an inference model.
Use the following script to export the inference model:
```
python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=model_best.pth trainer.output_dir=output/infer
```
## Paddle Inference
[infer.py](tools/infer.py) can be used to run Paddle Inference with the exported model.
Use the following script to run inference:
```
python tools/infer.py --model-dir=output/infer/ --img-path imgs/paper/db.jpg
```
<h2 id="Performance">Performance</h2>
### [ICDAR 2015](http://rrc.cvc.uab.es/?ch=4)
Trained only on the ICDAR2015 dataset.
| Method | image size (short size) |learning rate | Precision (%) | Recall (%) | F-measure (%) | FPS |
|:--------------------------:|:-------:|:--------:|:--------:|:------------:|:---------------:|:-----:|
| ImageNet-resnet50-FPN-DBHead (torch) |736 |1e-3|90.19 | 78.14 | 83.88 | 27 |
| ImageNet-resnet50-FPN-DBHead (paddle) |736 |1e-3| 89.47 | 79.03 | 83.92 | 27 |
| ImageNet-resnet50-FPN-DBHead (paddle_amp) |736 |1e-3| 88.62 | 79.95 | 84.06 | 27 |
### Examples
TBD
### References
1. https://arxiv.org/pdf/1911.08947.pdf
2. https://github.com/WenmuZhou/DBNet.pytorch
**If this repository helps you, please star it. Thanks!**

View File

@ -0,0 +1,2 @@
from .base_trainer import BaseTrainer
from .base_dataset import BaseDataSet

View File

@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 13:12
# @Author : zhoujun
import copy
import cv2
import numpy as np
from paddle.io import Dataset
from data_loader.modules import *
class BaseDataSet(Dataset):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
target_transform=None,
):
        assert img_mode in ["RGB", "BGR", "GRAY"]
self.ignore_tags = ignore_tags
self.data_list = self.load_data(data_path)
item_keys = ["img_path", "img_name", "text_polys", "texts", "ignore_tags"]
        for item in item_keys:
            assert (
                item in self.data_list[0]
            ), "data_list from load_data must contain {}".format(item)
self.img_mode = img_mode
self.filter_keys = filter_keys
self.transform = transform
self.target_transform = target_transform
self._init_pre_processes(pre_processes)
def _init_pre_processes(self, pre_processes):
self.aug = []
if pre_processes is not None:
for aug in pre_processes:
if "args" not in aug:
args = {}
else:
args = aug["args"]
if isinstance(args, dict):
cls = eval(aug["type"])(**args)
else:
cls = eval(aug["type"])(args)
self.aug.append(cls)
def load_data(self, data_path: str) -> list:
"""
把数据加载为一个list
:params data_path: 存储数据的文件夹或者文件
return a dict ,包含了'img_path','img_name','text_polys','texts','ignore_tags'
"""
raise NotImplementedError
def apply_pre_processes(self, data):
for aug in self.aug:
data = aug(data)
return data
def __getitem__(self, index):
try:
data = copy.deepcopy(self.data_list[index])
im = cv2.imread(data["img_path"], 1 if self.img_mode != "GRAY" else 0)
if self.img_mode == "RGB":
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
data["img"] = im
data["shape"] = [im.shape[0], im.shape[1]]
data = self.apply_pre_processes(data)
if self.transform:
data["img"] = self.transform(data["img"])
data["text_polys"] = data["text_polys"].tolist()
if len(self.filter_keys):
data_dict = {}
for k, v in data.items():
if k not in self.filter_keys:
data_dict[k] = v
return data_dict
else:
return data
        except Exception:
            # fall back to a random sample if this one fails to load
            return self.__getitem__(np.random.randint(self.__len__()))
def __len__(self):
return len(self.data_list)

View File

@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:50
# @Author : zhoujun
import os
import pathlib
import shutil
from pprint import pformat
import anyconfig
import paddle
import numpy as np
import random
from paddle.jit import to_static
from paddle.static import InputSpec
from utils import setup_logger
class BaseTrainer:
def __init__(
self,
config,
model,
criterion,
train_loader,
validate_loader,
metric_cls,
post_process=None,
):
config["trainer"]["output_dir"] = os.path.join(
str(pathlib.Path(os.path.abspath(__name__)).parent),
config["trainer"]["output_dir"],
)
config["name"] = config["name"] + "_" + model.name
self.save_dir = config["trainer"]["output_dir"]
self.checkpoint_dir = os.path.join(self.save_dir, "checkpoint")
os.makedirs(self.checkpoint_dir, exist_ok=True)
self.global_step = 0
self.start_epoch = 0
self.config = config
self.criterion = criterion
        # logger and VisualDL
self.visualdl_enable = self.config["trainer"].get("visual_dl", False)
self.epochs = self.config["trainer"]["epochs"]
self.log_iter = self.config["trainer"]["log_iter"]
if paddle.distributed.get_rank() == 0:
anyconfig.dump(config, os.path.join(self.save_dir, "config.yaml"))
self.logger = setup_logger(os.path.join(self.save_dir, "train.log"))
self.logger_info(pformat(self.config))
self.model = self.apply_to_static(model)
# device
if (
paddle.device.cuda.device_count() > 0
and paddle.device.is_compiled_with_cuda()
):
self.with_cuda = True
random.seed(self.config["trainer"]["seed"])
np.random.seed(self.config["trainer"]["seed"])
paddle.seed(self.config["trainer"]["seed"])
else:
self.with_cuda = False
self.logger_info("train with and paddle {}".format(paddle.__version__))
# metrics
self.metrics = {
"recall": 0,
"precision": 0,
"hmean": 0,
"train_loss": float("inf"),
"best_model_epoch": 0,
}
self.train_loader = train_loader
if validate_loader is not None:
assert post_process is not None and metric_cls is not None
self.validate_loader = validate_loader
self.post_process = post_process
self.metric_cls = metric_cls
self.train_loader_len = len(train_loader)
if self.validate_loader is not None:
self.logger_info(
"train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader".format(
len(self.train_loader.dataset),
self.train_loader_len,
len(self.validate_loader.dataset),
len(self.validate_loader),
)
)
else:
self.logger_info(
"train dataset has {} samples,{} in dataloader".format(
len(self.train_loader.dataset), self.train_loader_len
)
)
self._initialize_scheduler()
self._initialize_optimizer()
# resume or finetune
if self.config["trainer"]["resume_checkpoint"] != "":
self._load_checkpoint(
self.config["trainer"]["resume_checkpoint"], resume=True
)
elif self.config["trainer"]["finetune_checkpoint"] != "":
self._load_checkpoint(
self.config["trainer"]["finetune_checkpoint"], resume=False
)
if self.visualdl_enable and paddle.distributed.get_rank() == 0:
from visualdl import LogWriter
self.writer = LogWriter(self.save_dir)
        # mixed precision (AMP) training
self.amp = self.config.get("amp", None)
if self.amp == "None":
self.amp = None
if self.amp:
self.amp["scaler"] = paddle.amp.GradScaler(
init_loss_scaling=self.amp.get("scale_loss", 1024),
use_dynamic_loss_scaling=self.amp.get("use_dynamic_loss_scaling", True),
)
self.model, self.optimizer = paddle.amp.decorate(
models=self.model,
optimizers=self.optimizer,
level=self.amp.get("amp_level", "O2"),
)
        # distributed training
if paddle.device.cuda.device_count() > 1:
self.model = paddle.DataParallel(self.model)
        # set up inverse normalization (used for visualization)
self.UN_Normalize = False
for t in self.config["dataset"]["train"]["dataset"]["args"]["transforms"]:
if t["type"] == "Normalize":
self.normalize_mean = t["args"]["mean"]
self.normalize_std = t["args"]["std"]
self.UN_Normalize = True
def apply_to_static(self, model):
support_to_static = self.config["trainer"].get("to_static", False)
if support_to_static:
            specs = [InputSpec([None, 3, -1, -1])]
model = to_static(model, input_spec=specs)
self.logger_info(
"Successfully to apply @to_static with specs: {}".format(specs)
)
return model
def train(self):
"""
Full training logic
"""
for epoch in range(self.start_epoch + 1, self.epochs + 1):
self.epoch_result = self._train_epoch(epoch)
self._on_epoch_finish()
if paddle.distributed.get_rank() == 0 and self.visualdl_enable:
self.writer.close()
self._on_train_finish()
def _train_epoch(self, epoch):
"""
Training logic for an epoch
:param epoch: Current epoch number
"""
raise NotImplementedError
def _eval(self, epoch):
"""
eval logic for an epoch
:param epoch: Current epoch number
"""
raise NotImplementedError
def _on_epoch_finish(self):
raise NotImplementedError
def _on_train_finish(self):
raise NotImplementedError
def _save_checkpoint(self, epoch, file_name):
"""
        Save a checkpoint.
        :param epoch: current epoch number
        :param file_name: file name of the checkpoint
"""
state_dict = self.model.state_dict()
state = {
"epoch": epoch,
"global_step": self.global_step,
"state_dict": state_dict,
"optimizer": self.optimizer.state_dict(),
"config": self.config,
"metrics": self.metrics,
}
filename = os.path.join(self.checkpoint_dir, file_name)
paddle.save(state, filename)
def _load_checkpoint(self, checkpoint_path, resume):
"""
Resume from saved checkpoints
:param checkpoint_path: Checkpoint path to be resumed
"""
self.logger_info("Loading checkpoint: {} ...".format(checkpoint_path))
checkpoint = paddle.load(checkpoint_path)
self.model.set_state_dict(checkpoint["state_dict"])
if resume:
self.global_step = checkpoint["global_step"]
self.start_epoch = checkpoint["epoch"]
self.config["lr_scheduler"]["args"]["last_epoch"] = self.start_epoch
# self.scheduler.load_state_dict(checkpoint['scheduler'])
self.optimizer.set_state_dict(checkpoint["optimizer"])
if "metrics" in checkpoint:
self.metrics = checkpoint["metrics"]
self.logger_info(
"resume from checkpoint {} (epoch {})".format(
checkpoint_path, self.start_epoch
)
)
else:
self.logger_info("finetune from checkpoint {}".format(checkpoint_path))
def _initialize(self, name, module, *args, **kwargs):
module_name = self.config[name]["type"]
module_args = self.config[name].get("args", {})
assert all(
[k not in module_args for k in kwargs]
), "Overwriting kwargs given in config file is not allowed"
module_args.update(kwargs)
return getattr(module, module_name)(*args, **module_args)
def _initialize_scheduler(self):
self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)
def _initialize_optimizer(self):
self.optimizer = self._initialize(
"optimizer",
paddle.optimizer,
parameters=self.model.parameters(),
learning_rate=self.lr_scheduler,
)
def inverse_normalize(self, batch_img):
if self.UN_Normalize:
batch_img[:, 0, :, :] = (
batch_img[:, 0, :, :] * self.normalize_std[0] + self.normalize_mean[0]
)
batch_img[:, 1, :, :] = (
batch_img[:, 1, :, :] * self.normalize_std[1] + self.normalize_mean[1]
)
batch_img[:, 2, :, :] = (
batch_img[:, 2, :, :] * self.normalize_std[2] + self.normalize_mean[2]
)
def logger_info(self, s):
if paddle.distributed.get_rank() == 0:
self.logger.info(s)

View File

@ -0,0 +1,40 @@
name: DBNet
dataset:
train:
dataset:
      type: SynthTextDataset # dataset type
      args:
        data_path: '' # root directory of the SynthText dataset
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # augment with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        filter_keys: ['img_path','img_name','text_polys','texts','ignore_tags','shape'] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''

View File

@ -0,0 +1,65 @@
name: DBNet
base: ['config/SynthText.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path: ./datasets/SynthText
img_mode: RGB
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''

View File

@ -0,0 +1,69 @@
name: DBNet
dataset:
train:
dataset:
      type: ICDAR2015Dataset # dataset type
      args:
        data_path: # a file storing img_path \t gt_path pairs, one per line
          - ''
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # augment with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''
validate:
dataset:
type: ICDAR2015Dataset
args:
data_path:
- ''
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
transforms:
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
filter_keys: []
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,82 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: deformable_resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,82 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,83 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: StepLR
args:
step_size: 10
gama: 0.8
trainer:
seed: 2
epochs: 500
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,79 @@
name: DBNet
base: ['config/icdar2015.yaml']
arch:
type: Model
backbone:
type: resnet50
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
lr_scheduler:
type: Polynomial
args:
learning_rate: 0.001
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 10
show_images_iter: 50
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output/fp16_o2
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.txt
img_mode: RGB
loader:
batch_size: 16
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.txt
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,73 @@
name: DBNet
dataset:
train:
dataset:
      type: DetDataset # dataset type
      args:
        data_path: # a file storing img_path \t gt_path pairs, one per line
          - ''
        pre_processes: # data preprocessing, including augmentation and label generation
          - type: IaaAugment # augment with imgaug
args:
- {'type':Fliplr, 'args':{'p':0.5}}
- {'type': Affine, 'args':{'rotate':[-10,10]}}
- {'type':Resize,'args':{'size':[0.5,3]}}
- type: EastRandomCropData
args:
size: [640,640]
max_tries: 50
keep_ratio: true
- type: MakeBorderMap
args:
shrink_ratio: 0.4
thresh_min: 0.3
thresh_max: 0.7
- type: MakeShrinkMap
args:
shrink_ratio: 0.4
min_text_size: 8
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
load_char_annotation: false
expand_one_char: false
        filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # keys removed from the data dict before it is returned
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ''
validate:
dataset:
type: DetDataset
args:
data_path:
- ''
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
transforms:
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
        load_char_annotation: false # whether to load character-level annotations
        expand_one_char: false # whether to widen single-character boxes; after expansion w = w + h
filter_keys: []
ignore_tags: ['*', '###']
loader:
batch_size: 1
shuffle: true
num_workers: 0
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,86 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: deformable_resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,86 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: resnest50
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,93 @@
name: DBNet
base: ['config/open_dataset.yaml']
arch:
type: Model
backbone:
type: resnet18
pretrained: true
neck:
type: FPN
inner_channels: 256
head:
type: DBHead
out_channels: 2
k: 50
post_processing:
type: SegDetectorRepresenter
args:
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5 # from paper
metric:
type: QuadMetric
args:
is_output_polygon: false
loss:
type: DBLoss
alpha: 1
beta: 10
ohem_ratio: 3
optimizer:
type: Adam
args:
lr: 0.001
weight_decay: 0
amsgrad: true
lr_scheduler:
type: WarmupPolyLR
args:
warmup_epoch: 3
trainer:
seed: 2
epochs: 1200
log_iter: 1
show_images_iter: 1
resume_checkpoint: ''
finetune_checkpoint: ''
output_dir: output
visual_dl: false
amp:
scale_loss: 1024
amp_level: O2
custom_white_list: []
custom_black_list: ['exp', 'sigmoid', 'concat']
dataset:
train:
dataset:
args:
data_path:
- ./datasets/train.json
        transforms: # transforms applied to the image
- type: ToTensor
args: {}
- type: Normalize
args:
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 2
shuffle: true
num_workers: 6
collate_fn: ''
validate:
dataset:
args:
data_path:
- ./datasets/test.json
pre_processes:
- type: ResizeShortSize
args:
short_size: 736
resize_text_polys: false
img_mode: RGB
load_char_annotation: false
expand_one_char: false
loader:
batch_size: 1
shuffle: true
num_workers: 6
collate_fn: ICDARCollectFN

View File

@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:52
# @Author : zhoujun
import copy
import PIL
import numpy as np
import paddle
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
from paddle.vision import transforms
def get_dataset(data_path, module_name, transform, dataset_args):
"""
    Build the training dataset.
    :param data_path: list of dataset files, each storing samples as path/to/img\tlabel
    :param module_name: name of the custom dataset class to use (currently only data_loaders.ImageDataset is supported)
    :param transform: transforms used by this dataset
    :param dataset_args: arguments for module_name
    :return: the corresponding dataset object if the data_path list is non-empty, otherwise None
"""
from . import dataset
s_dataset = getattr(dataset, module_name)(
transform=transform, data_path=data_path, **dataset_args
)
return s_dataset
def get_transforms(transforms_config):
tr_list = []
for item in transforms_config:
if "args" not in item:
args = {}
else:
args = item["args"]
cls = getattr(transforms, item["type"])(**args)
tr_list.append(cls)
tr_list = transforms.Compose(tr_list)
return tr_list
class ICDARCollectFN:
def __init__(self, *args, **kwargs):
pass
def __call__(self, batch):
data_dict = {}
to_tensor_keys = []
for sample in batch:
for k, v in sample.items():
if k not in data_dict:
data_dict[k] = []
if isinstance(v, (np.ndarray, paddle.Tensor, PIL.Image.Image)):
if k not in to_tensor_keys:
to_tensor_keys.append(k)
data_dict[k].append(v)
for k in to_tensor_keys:
data_dict[k] = paddle.stack(data_dict[k], 0)
return data_dict
def get_dataloader(module_config, distributed=False):
if module_config is None:
return None
config = copy.deepcopy(module_config)
dataset_args = config["dataset"]["args"]
if "transforms" in dataset_args:
img_transforms = get_transforms(dataset_args.pop("transforms"))
else:
img_transforms = None
    # create the dataset
    dataset_name = config["dataset"]["type"]
    data_path = dataset_args.pop("data_path")
    if data_path is None:
        return None
data_path = [x for x in data_path if x is not None]
if len(data_path) == 0:
return None
if (
"collate_fn" not in config["loader"]
or config["loader"]["collate_fn"] is None
or len(config["loader"]["collate_fn"]) == 0
):
config["loader"]["collate_fn"] = None
else:
config["loader"]["collate_fn"] = eval(config["loader"]["collate_fn"])()
_dataset = get_dataset(
data_path=data_path,
module_name=dataset_name,
transform=img_transforms,
dataset_args=dataset_args,
)
sampler = None
if distributed:
        # use a DistributedBatchSampler for distributed training
batch_sampler = DistributedBatchSampler(
dataset=_dataset,
batch_size=config["loader"].pop("batch_size"),
shuffle=config["loader"].pop("shuffle"),
)
else:
batch_sampler = BatchSampler(
dataset=_dataset,
batch_size=config["loader"].pop("batch_size"),
shuffle=config["loader"].pop("shuffle"),
)
loader = DataLoader(
dataset=_dataset, batch_sampler=batch_sampler, **config["loader"]
)
return loader

View File

@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:54
# @Author : zhoujun
import pathlib
import os
import cv2
import numpy as np
import scipy.io as sio
from tqdm.auto import tqdm
from base import BaseDataSet
from utils import order_points_clockwise, get_datalist, load, expand_polygon
class ICDAR2015Dataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
**kwargs,
):
super().__init__(
data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
)
def load_data(self, data_path: str) -> list:
data_list = get_datalist(data_path)
t_data_list = []
for img_path, label_path in data_list:
data = self._get_annotation(label_path)
if len(data["text_polys"]) > 0:
item = {"img_path": img_path, "img_name": pathlib.Path(img_path).stem}
item.update(data)
t_data_list.append(item)
else:
print("there is no suit bbox in {}".format(label_path))
return t_data_list
def _get_annotation(self, label_path: str) -> dict:
boxes = []
texts = []
ignores = []
with open(label_path, encoding="utf-8", mode="r") as f:
for line in f.readlines():
params = line.strip().strip("\ufeff").strip("\xef\xbb\xbf").split(",")
try:
box = order_points_clockwise(
np.array(list(map(float, params[:8]))).reshape(-1, 2)
)
if cv2.contourArea(box) > 0:
boxes.append(box)
label = params[8]
texts.append(label)
ignores.append(label in self.ignore_tags)
                except Exception:
                    print("load label failed on {}".format(label_path))
data = {
"text_polys": np.array(boxes),
"texts": texts,
"ignore_tags": ignores,
}
return data
class DetDataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
ignore_tags,
transform=None,
**kwargs,
):
self.load_char_annotation = kwargs["load_char_annotation"]
self.expand_one_char = kwargs["expand_one_char"]
super().__init__(
data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
)
def load_data(self, data_path: str) -> list:
"""
        Read text-line polygons and ground truth (and optionally character-level annotations) from json files.
:param data_path:
:return:
"""
data_list = []
for path in data_path:
content = load(path)
for gt in tqdm(content["data_list"], desc="read file {}".format(path)):
img_path = os.path.join(content["data_root"], gt["img_name"])
polygons = []
texts = []
illegibility_list = []
language_list = []
for annotation in gt["annotations"]:
if len(annotation["polygon"]) == 0 or len(annotation["text"]) == 0:
continue
if len(annotation["text"]) > 1 and self.expand_one_char:
annotation["polygon"] = expand_polygon(annotation["polygon"])
polygons.append(annotation["polygon"])
texts.append(annotation["text"])
illegibility_list.append(annotation["illegibility"])
language_list.append(annotation["language"])
if self.load_char_annotation:
for char_annotation in annotation["chars"]:
if (
len(char_annotation["polygon"]) == 0
or len(char_annotation["char"]) == 0
):
continue
polygons.append(char_annotation["polygon"])
texts.append(char_annotation["char"])
illegibility_list.append(char_annotation["illegibility"])
language_list.append(char_annotation["language"])
data_list.append(
{
"img_path": img_path,
"img_name": gt["img_name"],
"text_polys": np.array(polygons),
"texts": texts,
"ignore_tags": illegibility_list,
}
)
return data_list
class SynthTextDataset(BaseDataSet):
def __init__(
self,
data_path: str,
img_mode,
pre_processes,
filter_keys,
transform=None,
**kwargs,
):
self.transform = transform
self.dataRoot = pathlib.Path(data_path)
        if not self.dataRoot.exists():
            raise FileNotFoundError("Dataset folder does not exist.")
        self.targetFilePath = self.dataRoot / "gt.mat"
        if not self.targetFilePath.exists():
            raise FileNotFoundError("Target file does not exist.")
targets = {}
sio.loadmat(
self.targetFilePath,
targets,
squeeze_me=True,
struct_as_record=False,
variable_names=["imnames", "wordBB", "txt"],
)
self.imageNames = targets["imnames"]
self.wordBBoxes = targets["wordBB"]
self.transcripts = targets["txt"]
super().__init__(data_path, img_mode, pre_processes, filter_keys, transform)
def load_data(self, data_path: str) -> list:
t_data_list = []
for imageName, wordBBoxes, texts in zip(
self.imageNames, self.wordBBoxes, self.transcripts
):
item = {}
wordBBoxes = (
np.expand_dims(wordBBoxes, axis=2)
if (wordBBoxes.ndim == 2)
else wordBBoxes
)
_, _, numOfWords = wordBBoxes.shape
text_polys = wordBBoxes.reshape(
[8, numOfWords], order="F"
).T # num_words * 8
text_polys = text_polys.reshape(numOfWords, 4, 2) # num_of_words * 4 * 2
transcripts = [word for line in texts for word in line.split()]
if numOfWords != len(transcripts):
continue
item["img_path"] = str(self.dataRoot / imageName)
item["img_name"] = (self.dataRoot / imageName).stem
item["text_polys"] = text_polys
item["texts"] = transcripts
item["ignore_tags"] = [x in self.ignore_tags for x in transcripts]
t_data_list.append(item)
return t_data_list
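# Note (illustration only, not part of the upstream file): SynthText's gt.mat
# stores, per image, `imnames` (relative paths), `wordBB` (2 x 4 x num_words
# corner coordinates, hence the Fortran-order reshape above) and `txt`
# (whitespace-separated transcripts); samples whose word count does not match
# the number of boxes are skipped.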

View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 10:53
# @Author : zhoujun
from .iaa_augment import IaaAugment
from .augment import *
from .random_crop_data import EastRandomCropData, PSERandomCrop
from .make_border_map import MakeBorderMap
from .make_shrink_map import MakeShrinkMap

View File

@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:52
# @Author : zhoujun
import math
import numbers
import random
import cv2
import numpy as np
from skimage.util import random_noise
class RandomNoise:
def __init__(self, random_rate):
self.random_rate = random_rate
def __call__(self, data: dict):
"""
        Add random gaussian noise to the image.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
data["img"] = (
random_noise(data["img"], mode="gaussian", clip=True) * 255
).astype(data["img"].dtype)
return data
class RandomScale:
def __init__(self, scales, random_rate):
"""
        :param scales: candidate scale factors
        :param random_rate: probability of applying the transform
:return:
"""
self.random_rate = random_rate
self.scales = scales
def __call__(self, data: dict) -> dict:
"""
        Randomly pick a scale from `scales` and resize the image and text polygons.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
tmp_text_polys = text_polys.copy()
rd_scale = float(np.random.choice(self.scales))
im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
tmp_text_polys *= rd_scale
data["img"] = im
data["text_polys"] = tmp_text_polys
return data
class RandomRotateImgBox:
def __init__(self, degrees, random_rate, same_size=False):
"""
        :param degrees: rotation angle, a single number or a [min, max] sequence
        :param random_rate: probability of applying the transform
        :param same_size: whether to keep the output the same size as the input
:return:
"""
if isinstance(degrees, numbers.Number):
if degrees < 0:
raise ValueError("If degrees is a single number, it must be positive.")
degrees = (-degrees, degrees)
elif (
isinstance(degrees, list)
or isinstance(degrees, tuple)
or isinstance(degrees, np.ndarray)
):
if len(degrees) != 2:
raise ValueError("If degrees is a sequence, it must be of len 2.")
degrees = degrees
else:
raise Exception("degrees must in Number or list or tuple or np.ndarray")
self.degrees = degrees
self.same_size = same_size
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Randomly rotate the image and text polygons.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
        # ---------------------- rotate the image ----------------------
w = im.shape[1]
h = im.shape[0]
angle = np.random.uniform(self.degrees[0], self.degrees[1])
if self.same_size:
nw = w
nh = h
else:
            # degrees to radians
rangle = np.deg2rad(angle)
            # compute the w, h of the rotated image
nw = abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)
nh = abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)
        # build the affine (rotation) matrix
rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1)
        # offset from the old image center to the new image center
rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
        # update the affine matrix with the offset
rot_mat[0, 2] += rot_move[0]
rot_mat[1, 2] += rot_move[1]
        # apply the affine transform
rot_img = cv2.warpAffine(
im,
rot_mat,
(int(math.ceil(nw)), int(math.ceil(nh))),
flags=cv2.INTER_LANCZOS4,
)
        # ---------------------- correct the bbox coordinates ----------------------
        # rot_mat is the final rotation matrix
        # map the four corner points of each original bbox into the rotated frame
rot_text_polys = list()
for bbox in text_polys:
point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
rot_text_polys.append([point1, point2, point3, point4])
data["img"] = rot_img
data["text_polys"] = np.array(rot_text_polys)
return data
class RandomResize:
def __init__(self, size, random_rate, keep_ratio=False):
"""
        :param size: target size, a single number or a [w, h] sequence
        :param random_rate: probability of applying the transform
        :param keep_ratio: whether to keep the aspect ratio (pad before resizing)
:return:
"""
if isinstance(size, numbers.Number):
if size < 0:
raise ValueError(
"If input_size is a single number, it must be positive."
)
size = (size, size)
elif (
isinstance(size, list)
or isinstance(size, tuple)
or isinstance(size, np.ndarray)
):
if len(size) != 2:
raise ValueError("If input_size is a sequence, it must be of len 2.")
size = (size[0], size[1])
else:
raise Exception("input_size must in Number or list or tuple or np.ndarray")
self.size = size
self.keep_ratio = keep_ratio
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Resize the image and text polygons to the target size.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
if self.keep_ratio:
            # pad the image to the target size so the aspect ratio is kept
h, w, c = im.shape
max_h = max(h, self.size[0])
max_w = max(w, self.size[1])
im_padded = np.zeros((max_h, max_w, c), dtype=np.uint8)
im_padded[:h, :w] = im.copy()
im = im_padded
text_polys = text_polys.astype(np.float32)
h, w, _ = im.shape
im = cv2.resize(im, self.size)
w_scale = self.size[0] / float(w)
h_scale = self.size[1] / float(h)
text_polys[:, :, 0] *= w_scale
text_polys[:, :, 1] *= h_scale
data["img"] = im
data["text_polys"] = text_polys
return data
def resize_image(img, short_size):
height, width, _ = img.shape
if height < width:
new_height = short_size
new_width = new_height / height * width
else:
new_width = short_size
new_height = new_width / width * height
new_height = int(round(new_height / 32) * 32)
new_width = int(round(new_width / 32) * 32)
resized_img = cv2.resize(img, (new_width, new_height))
return resized_img, (new_width / width, new_height / height)
class ResizeShortSize:
def __init__(self, short_size, resize_text_polys=True):
"""
        :param short_size: target length of the shorter image side
:return:
"""
self.short_size = short_size
self.resize_text_polys = resize_text_polys
def __call__(self, data: dict) -> dict:
"""
        Resize the image (and optionally the text polygons) so the short side is at least `short_size`.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
h, w, _ = im.shape
short_edge = min(h, w)
if short_edge < self.short_size:
            # ensure the short side >= short_size
scale = self.short_size / short_edge
im = cv2.resize(im, dsize=None, fx=scale, fy=scale)
scale = (scale, scale)
# im, scale = resize_image(im, self.short_size)
if self.resize_text_polys:
# text_polys *= scale
                text_polys[:, :, 0] *= scale[0]
                text_polys[:, :, 1] *= scale[1]
data["img"] = im
data["text_polys"] = text_polys
return data
class HorizontalFlip:
def __init__(self, random_rate):
"""
        :param random_rate: probability of applying the transform
"""
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Randomly flip the image and text polygons horizontally.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
flip_text_polys = text_polys.copy()
flip_im = cv2.flip(im, 1)
h, w, _ = flip_im.shape
flip_text_polys[:, :, 0] = w - flip_text_polys[:, :, 0]
data["img"] = flip_im
data["text_polys"] = flip_text_polys
return data
class VerticalFlip:
def __init__(self, random_rate):
"""
        :param random_rate: probability of applying the transform
"""
self.random_rate = random_rate
def __call__(self, data: dict) -> dict:
"""
        Randomly flip the image and text polygons vertically.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
if random.random() > self.random_rate:
return data
im = data["img"]
text_polys = data["text_polys"]
flip_text_polys = text_polys.copy()
flip_im = cv2.flip(im, 0)
h, w, _ = flip_im.shape
flip_text_polys[:, :, 1] = h - flip_text_polys[:, :, 1]
data["img"] = flip_im
data["text_polys"] = flip_text_polys
return data
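# Hedged usage sketch (added for illustration, not part of the upstream file):
# all transforms share the same data-dict contract, so they can be chained.
if __name__ == "__main__":
    data = {
        "img": np.zeros((100, 200, 3), dtype=np.uint8),
        "text_polys": np.array(
            [[[10, 10], [60, 10], [60, 40], [10, 40]]], dtype=np.float32
        ),
        "texts": ["demo"],
        "ignore_tags": [False],
    }
    # random_rate=1.0 forces each transform to fire in this sketch
    for t in [RandomScale([0.5, 1.0, 2.0], random_rate=1.0), HorizontalFlip(1.0)]:
        data = t(data)
    print(data["img"].shape, data["text_polys"][0])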

View File

@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 18:06
# @Author : zhoujun
import numpy as np
import imgaug
import imgaug.augmenters as iaa
class AugmenterBuilder(object):
def __init__(self):
pass
def build(self, args, root=True):
if args is None or len(args) == 0:
return None
elif isinstance(args, list):
if root:
sequence = [self.build(value, root=False) for value in args]
return iaa.Sequential(sequence)
else:
return getattr(iaa, args[0])(
*[self.to_tuple_if_list(a) for a in args[1:]]
)
elif isinstance(args, dict):
cls = getattr(iaa, args["type"])
return cls(**{k: self.to_tuple_if_list(v) for k, v in args["args"].items()})
else:
raise RuntimeError("unknown augmenter arg: " + str(args))
def to_tuple_if_list(self, obj):
if isinstance(obj, list):
return tuple(obj)
return obj
class IaaAugment:
def __init__(self, augmenter_args):
self.augmenter_args = augmenter_args
self.augmenter = AugmenterBuilder().build(self.augmenter_args)
def __call__(self, data):
image = data["img"]
shape = image.shape
if self.augmenter:
aug = self.augmenter.to_deterministic()
data["img"] = aug.augment_image(image)
data = self.may_augment_annotation(aug, data, shape)
return data
def may_augment_annotation(self, aug, data, shape):
if aug is None:
return data
line_polys = []
for poly in data["text_polys"]:
new_poly = self.may_augment_poly(aug, shape, poly)
line_polys.append(new_poly)
data["text_polys"] = np.array(line_polys)
return data
def may_augment_poly(self, aug, img_shape, poly):
keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
keypoints = aug.augment_keypoints(
[imgaug.KeypointsOnImage(keypoints, shape=img_shape)]
)[0].keypoints
poly = [(p.x, p.y) for p in keypoints]
return poly
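# Hedged usage sketch (added for illustration, not part of the upstream file):
# the builder accepts either list entries ['Name', arg, ...] or dict entries
# {'type': 'Name', 'args': {...}}; inner lists become tuples so imgaug samples
# them as ranges. Assumes imgaug >= 0.4.
if __name__ == "__main__":
    augmenter_args = [
        {"type": "Fliplr", "args": {"p": 0.5}},
        {"type": "Affine", "args": {"rotate": [-10, 10]}},
        {"type": "Resize", "args": {"size": [0.5, 3.0]}},
    ]
    data = {
        "img": np.zeros((100, 200, 3), dtype=np.uint8),
        "text_polys": np.array(
            [[[10, 10], [60, 10], [60, 40], [10, 40]]], dtype=np.float32
        ),
    }
    out = IaaAugment(augmenter_args)(data)
    print(out["img"].shape, out["text_polys"].shape)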

View File

@ -0,0 +1,159 @@
import cv2
import numpy as np
np.seterr(divide="ignore", invalid="ignore")
import pyclipper
from shapely.geometry import Polygon
class MakeBorderMap:
def __init__(self, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7):
self.shrink_ratio = shrink_ratio
self.thresh_min = thresh_min
self.thresh_max = thresh_max
def __call__(self, data: dict) -> dict:
"""
        Generate the threshold (border) map and its mask for DB training.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
canvas = np.zeros(im.shape[:2], dtype=np.float32)
mask = np.zeros(im.shape[:2], dtype=np.float32)
for i in range(len(text_polys)):
if ignore_tags[i]:
continue
self.draw_border_map(text_polys[i], canvas, mask=mask)
canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min
data["threshold_map"] = canvas
data["threshold_mask"] = mask
return data
def draw_border_map(self, polygon, canvas, mask):
polygon = np.array(polygon)
assert polygon.ndim == 2
assert polygon.shape[1] == 2
polygon_shape = Polygon(polygon)
if polygon_shape.area <= 0:
return
distance = (
polygon_shape.area
* (1 - np.power(self.shrink_ratio, 2))
/ polygon_shape.length
)
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
padded_polygon = np.array(padding.Execute(distance)[0])
cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)
xmin = padded_polygon[:, 0].min()
xmax = padded_polygon[:, 0].max()
ymin = padded_polygon[:, 1].min()
ymax = padded_polygon[:, 1].max()
width = xmax - xmin + 1
height = ymax - ymin + 1
polygon[:, 0] = polygon[:, 0] - xmin
polygon[:, 1] = polygon[:, 1] - ymin
xs = np.broadcast_to(
np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)
)
ys = np.broadcast_to(
np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width)
)
distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
for i in range(polygon.shape[0]):
j = (i + 1) % polygon.shape[0]
absolute_distance = self.distance(xs, ys, polygon[i], polygon[j])
distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
distance_map = distance_map.min(axis=0)
xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1] = np.fmax(
1
- distance_map[
ymin_valid - ymin : ymax_valid - ymax + height,
xmin_valid - xmin : xmax_valid - xmax + width,
],
canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1],
)
def distance(self, xs, ys, point_1, point_2):
"""
compute the distance from point to a line
ys: coordinates in the first axis
xs: coordinates in the second axis
point_1, point_2: (x, y), the end of the line
"""
height, width = xs.shape[:2]
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
square_distance = np.square(point_1[0] - point_2[0]) + np.square(
point_1[1] - point_2[1]
)
cosin = (square_distance - square_distance_1 - square_distance_2) / (
2 * np.sqrt(square_distance_1 * square_distance_2)
)
square_sin = 1 - np.square(cosin)
square_sin = np.nan_to_num(square_sin)
result = np.sqrt(
square_distance_1 * square_distance_2 * square_sin / square_distance
)
result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[
cosin < 0
]
# self.extend_line(point_1, point_2, result)
return result
def extend_line(self, point_1, point_2, result):
ex_point_1 = (
int(
round(point_1[0] + (point_1[0] - point_2[0]) * (1 + self.shrink_ratio))
),
int(
round(point_1[1] + (point_1[1] - point_2[1]) * (1 + self.shrink_ratio))
),
)
cv2.line(
result,
tuple(ex_point_1),
tuple(point_1),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0,
)
ex_point_2 = (
int(
round(point_2[0] + (point_2[0] - point_1[0]) * (1 + self.shrink_ratio))
),
int(
round(point_2[1] + (point_2[1] - point_1[1]) * (1 + self.shrink_ratio))
),
)
cv2.line(
result,
tuple(ex_point_2),
tuple(point_2),
4096.0,
1,
lineType=cv2.LINE_AA,
shift=0,
)
return ex_point_1, ex_point_2
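# Hedged usage sketch (added for illustration, not part of the upstream file):
# following the DB paper, each polygon is dilated by D = A * (1 - r^2) / L
# (area A, perimeter L, shrink ratio r); pixels in the dilated ring receive
# 1 - clipped normalized distance to the polygon edges, finally rescaled into
# [thresh_min, thresh_max].
if __name__ == "__main__":
    data = {
        "img": np.zeros((100, 200, 3), dtype=np.uint8),
        "text_polys": np.array(
            [[[30, 30], [120, 30], [120, 70], [30, 70]]], dtype=np.float32
        ),
        "ignore_tags": [False],
    }
    data = MakeBorderMap()(data)
    print(data["threshold_map"].shape, float(data["threshold_map"].max()))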

View File

@ -0,0 +1,129 @@
import numpy as np
import cv2
def shrink_polygon_py(polygon, shrink_ratio):
"""
    Shrink the polygon toward its center; scaling back by 1 / shrink_ratio restores it.
"""
cx = polygon[:, 0].mean()
cy = polygon[:, 1].mean()
polygon[:, 0] = cx + (polygon[:, 0] - cx) * shrink_ratio
polygon[:, 1] = cy + (polygon[:, 1] - cy) * shrink_ratio
return polygon
def shrink_polygon_pyclipper(polygon, shrink_ratio):
from shapely.geometry import Polygon
import pyclipper
polygon_shape = Polygon(polygon)
distance = (
polygon_shape.area * (1 - np.power(shrink_ratio, 2)) / polygon_shape.length
)
subject = [tuple(l) for l in polygon]
padding = pyclipper.PyclipperOffset()
padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
shrunk = padding.Execute(-distance)
if shrunk == []:
shrunk = np.array(shrunk)
else:
shrunk = np.array(shrunk[0]).reshape(-1, 2)
return shrunk
class MakeShrinkMap:
r"""
Making binary mask from detection data with ICDAR format.
Typically following the process of class `MakeICDARData`.
"""
def __init__(self, min_text_size=8, shrink_ratio=0.4, shrink_type="pyclipper"):
shrink_func_dict = {
"py": shrink_polygon_py,
"pyclipper": shrink_polygon_pyclipper,
}
self.shrink_func = shrink_func_dict[shrink_type]
self.min_text_size = min_text_size
self.shrink_ratio = shrink_ratio
def __call__(self, data: dict) -> dict:
"""
        Generate the shrink map and its mask for DB training.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
image = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
h, w = image.shape[:2]
text_polys, ignore_tags = self.validate_polygons(text_polys, ignore_tags, h, w)
gt = np.zeros((h, w), dtype=np.float32)
mask = np.ones((h, w), dtype=np.float32)
for i in range(len(text_polys)):
polygon = text_polys[i]
height = max(polygon[:, 1]) - min(polygon[:, 1])
width = max(polygon[:, 0]) - min(polygon[:, 0])
if ignore_tags[i] or min(height, width) < self.min_text_size:
cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
else:
shrunk = self.shrink_func(polygon, self.shrink_ratio)
if shrunk.size == 0:
cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
ignore_tags[i] = True
continue
cv2.fillPoly(gt, [shrunk.astype(np.int32)], 1)
data["shrink_map"] = gt
data["shrink_mask"] = mask
return data
def validate_polygons(self, polygons, ignore_tags, h, w):
"""
polygons (numpy.array, required): of shape (num_instances, num_points, 2)
"""
if len(polygons) == 0:
return polygons, ignore_tags
assert len(polygons) == len(ignore_tags)
for polygon in polygons:
polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)
for i in range(len(polygons)):
area = self.polygon_area(polygons[i])
if abs(area) < 1:
ignore_tags[i] = True
if area > 0:
polygons[i] = polygons[i][::-1, :]
return polygons, ignore_tags
def polygon_area(self, polygon):
return cv2.contourArea(polygon)
# edge = 0
# for i in range(polygon.shape[0]):
# next_index = (i + 1) % polygon.shape[0]
# edge += (polygon[next_index, 0] - polygon[i, 0]) * (polygon[next_index, 1] - polygon[i, 1])
#
# return edge / 2.
if __name__ == "__main__":
from shapely.geometry import Polygon
import pyclipper
polygon = np.array([[0, 0], [100, 10], [100, 100], [10, 90]])
a = shrink_polygon_py(polygon, 0.4)
print(a)
print(shrink_polygon_py(a, 1 / 0.4))
b = shrink_polygon_pyclipper(polygon, 0.4)
print(b)
poly = Polygon(b)
distance = poly.area * 1.5 / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(b, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
bounding_box = cv2.minAreaRect(expanded)
points = cv2.boxPoints(bounding_box)
print(points)

View File

@ -0,0 +1,211 @@
import random
import cv2
import numpy as np
# random crop algorithm similar to https://github.com/argman/EAST
class EastRandomCropData:
def __init__(
self,
size=(640, 640),
max_tries=50,
min_crop_side_ratio=0.1,
require_original_image=False,
keep_ratio=True,
):
self.size = size
self.max_tries = max_tries
self.min_crop_side_ratio = min_crop_side_ratio
self.require_original_image = require_original_image
self.keep_ratio = keep_ratio
def __call__(self, data: dict) -> dict:
"""
        Randomly crop a text-containing region and resize/pad it to the target size.
:param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
:return:
"""
im = data["img"]
text_polys = data["text_polys"]
ignore_tags = data["ignore_tags"]
texts = data["texts"]
all_care_polys = [text_polys[i] for i, tag in enumerate(ignore_tags) if not tag]
        # compute the crop region
crop_x, crop_y, crop_w, crop_h = self.crop_area(im, all_care_polys)
        # crop the image, padding to keep the aspect ratio
scale_w = self.size[0] / crop_w
scale_h = self.size[1] / crop_h
scale = min(scale_w, scale_h)
h = int(crop_h * scale)
w = int(crop_w * scale)
if self.keep_ratio:
if len(im.shape) == 3:
padimg = np.zeros((self.size[1], self.size[0], im.shape[2]), im.dtype)
else:
padimg = np.zeros((self.size[1], self.size[0]), im.dtype)
padimg[:h, :w] = cv2.resize(
im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], (w, h)
)
img = padimg
else:
img = cv2.resize(
im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], tuple(self.size)
)
        # crop the text polygons
text_polys_crop = []
ignore_tags_crop = []
texts_crop = []
for poly, text, tag in zip(text_polys, texts, ignore_tags):
poly = ((poly - (crop_x, crop_y)) * scale).tolist()
if not self.is_poly_outside_rect(poly, 0, 0, w, h):
text_polys_crop.append(poly)
ignore_tags_crop.append(tag)
texts_crop.append(text)
data["img"] = img
data["text_polys"] = np.float32(text_polys_crop)
data["ignore_tags"] = ignore_tags_crop
data["texts"] = texts_crop
return data
def is_poly_in_rect(self, poly, x, y, w, h):
poly = np.array(poly)
if poly[:, 0].min() < x or poly[:, 0].max() > x + w:
return False
if poly[:, 1].min() < y or poly[:, 1].max() > y + h:
return False
return True
def is_poly_outside_rect(self, poly, x, y, w, h):
poly = np.array(poly)
if poly[:, 0].max() < x or poly[:, 0].min() > x + w:
return True
if poly[:, 1].max() < y or poly[:, 1].min() > y + h:
return True
return False
def split_regions(self, axis):
regions = []
min_axis = 0
for i in range(1, axis.shape[0]):
if axis[i] != axis[i - 1] + 1:
region = axis[min_axis:i]
min_axis = i
regions.append(region)
return regions
def random_select(self, axis, max_size):
xx = np.random.choice(axis, size=2)
xmin = np.min(xx)
xmax = np.max(xx)
xmin = np.clip(xmin, 0, max_size - 1)
xmax = np.clip(xmax, 0, max_size - 1)
return xmin, xmax
def region_wise_random_select(self, regions, max_size):
selected_index = list(np.random.choice(len(regions), 2))
selected_values = []
for index in selected_index:
axis = regions[index]
xx = int(np.random.choice(axis, size=1))
selected_values.append(xx)
xmin = min(selected_values)
xmax = max(selected_values)
return xmin, xmax
def crop_area(self, im, text_polys):
h, w = im.shape[:2]
h_array = np.zeros(h, dtype=np.int32)
w_array = np.zeros(w, dtype=np.int32)
for points in text_polys:
points = np.round(points, decimals=0).astype(np.int32)
minx = np.min(points[:, 0])
maxx = np.max(points[:, 0])
w_array[minx:maxx] = 1
miny = np.min(points[:, 1])
maxy = np.max(points[:, 1])
h_array[miny:maxy] = 1
# ensure the cropped area not across a text
h_axis = np.where(h_array == 0)[0]
w_axis = np.where(w_array == 0)[0]
if len(h_axis) == 0 or len(w_axis) == 0:
return 0, 0, w, h
h_regions = self.split_regions(h_axis)
w_regions = self.split_regions(w_axis)
for i in range(self.max_tries):
if len(w_regions) > 1:
xmin, xmax = self.region_wise_random_select(w_regions, w)
else:
xmin, xmax = self.random_select(w_axis, w)
if len(h_regions) > 1:
ymin, ymax = self.region_wise_random_select(h_regions, h)
else:
ymin, ymax = self.random_select(h_axis, h)
if (
xmax - xmin < self.min_crop_side_ratio * w
or ymax - ymin < self.min_crop_side_ratio * h
):
# area too small
continue
num_poly_in_rect = 0
for poly in text_polys:
if not self.is_poly_outside_rect(
poly, xmin, ymin, xmax - xmin, ymax - ymin
):
num_poly_in_rect += 1
break
if num_poly_in_rect > 0:
return xmin, ymin, xmax - xmin, ymax - ymin
return 0, 0, w, h
class PSERandomCrop:
def __init__(self, size):
self.size = size
def __call__(self, data):
imgs = data["imgs"]
h, w = imgs[0].shape[0:2]
th, tw = self.size
if w == tw and h == th:
return imgs
        # if the label contains text instances, crop around them with some probability (checked via the threshold_label_map)
if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
            # top-left corner of the text instances
tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
tl[tl < 0] = 0
            # bottom-right corner of the text instances
br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
br[br < 0] = 0
            # make sure there is enough room to crop when the bottom-right point is picked
br[0] = min(br[0], h - th)
br[1] = min(br[1], w - tw)
for _ in range(50000):
i = random.randint(tl[0], br[0])
j = random.randint(tl[1], br[1])
                # make sure the shrink_label_map contains text
if imgs[1][i : i + th, j : j + tw].sum() <= 0:
continue
else:
break
else:
i = random.randint(0, h - th)
j = random.randint(0, w - tw)
# return i, j, th, tw
for idx in range(len(imgs)):
if len(imgs[idx].shape) == 3:
imgs[idx] = imgs[idx][i : i + th, j : j + tw, :]
else:
imgs[idx] = imgs[idx][i : i + th, j : j + tw]
data["imgs"] = imgs
return data
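# Hedged usage sketch (added for illustration, not part of the upstream file):
# crop a random text-containing region, then resize/pad it to 640x640;
# polygons falling completely outside the crop are dropped.
if __name__ == "__main__":
    data = {
        "img": np.zeros((720, 1280, 3), dtype=np.uint8),
        "text_polys": np.array(
            [[[100, 100], [300, 100], [300, 160], [100, 160]]], dtype=np.float32
        ),
        "texts": ["demo"],
        "ignore_tags": [False],
    }
    data = EastRandomCropData(size=(640, 640))(data)
    print(data["img"].shape, len(data["text_polys"]))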

View File

@ -0,0 +1,21 @@
name: dbnet
channels:
- conda-forge
- defaults
dependencies:
- anyconfig==0.9.10
- future==0.18.2
- imgaug==0.4.0
- matplotlib==3.1.2
- numpy==1.17.4
- opencv
- pyclipper
- PyYAML==5.2
- scikit-image==0.16.2
- Shapely==1.6.4
- tensorboard=2
- tqdm==4.40.1
- ipython
- pip
- pip:
- polygon3

View File

@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py --model_path ''

View File

@ -0,0 +1,17 @@
# Only use this if the image and txt file names are identical
rm ./datasets/train_img.txt
rm ./datasets/train_gt.txt
rm ./datasets/test_img.txt
rm ./datasets/test_gt.txt
rm ./datasets/train.txt
rm ./datasets/test.txt
ls ./datasets/train/img/*.jpg > ./datasets/train_img.txt
ls ./datasets/train/gt/*.txt > ./datasets/train_gt.txt
ls ./datasets/test/img/*.jpg > ./datasets/test_img.txt
ls ./datasets/test/gt/*.txt > ./datasets/test_gt.txt
paste ./datasets/train_img.txt ./datasets/train_gt.txt > ./datasets/train.txt
paste ./datasets/test_img.txt ./datasets/test_gt.txt > ./datasets/test.txt
rm ./datasets/train_img.txt
rm ./datasets/train_gt.txt
rm ./datasets/test_img.txt
rm ./datasets/test_gt.txt
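# After running, train.txt / test.txt contain one sample per line: the image
# path and its label path joined by a tab, e.g. (paths are illustrative):
#   ./datasets/train/img/img_10.jpg	./datasets/train/gt/img_10.txt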

View File

@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:55
# @Author : zhoujun
import copy
from .model import Model
from .losses import build_loss
__all__ = ["build_loss", "build_model"]
support_model = ["Model"]
def build_model(config):
"""
get architecture model class
"""
copy_config = copy.deepcopy(config)
arch_type = copy_config.pop("type")
assert (
arch_type in support_model
), f"{arch_type} is not developed yet!, only {support_model} are support now"
arch_model = eval(arch_type)(copy_config)
return arch_model
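# Hedged usage sketch (comment only -- the relative imports above keep this
# file from being run directly). The config mirrors the `arch` section of the
# training yaml; the exact values below are illustrative assumptions:
#
#   config = {
#       "type": "Model",
#       "backbone": {"type": "resnet18", "pretrained": False},
#       "neck": {"type": "FPN", "inner_channels": 256},
#       "head": {"type": "DBHead", "out_channels": 2, "k": 50},
#   }
#   model = build_model(config)   # -> model.name == "resnet18_FPN_DBHead"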

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:54
# @Author : zhoujun
from .resnet import *
__all__ = ["build_backbone"]
support_backbone = [
"resnet18",
"deformable_resnet18",
"deformable_resnet50",
"resnet50",
"resnet34",
"resnet101",
"resnet152",
]
def build_backbone(backbone_name, **kwargs):
assert (
backbone_name in support_backbone
), f"all support backbone is {support_backbone}"
backbone = eval(backbone_name)(**kwargs)
return backbone

View File

@ -0,0 +1,366 @@
import math
import paddle
from paddle import nn
BatchNorm2d = nn.BatchNorm2D
__all__ = [
"ResNet",
"resnet18",
"resnet34",
"resnet50",
"resnet101",
"deformable_resnet18",
"deformable_resnet50",
"resnet152",
]
model_urls = {
"resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth",
"resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth",
"resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth",
"resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth",
"resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth",
}
def constant_init(module, constant, bias=0):
module.weight = paddle.create_parameter(
shape=module.weight.shape,
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(constant),
)
if hasattr(module, "bias"):
module.bias = paddle.create_parameter(
shape=module.bias.shape,
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(bias),
)
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2D(
in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
)
class BasicBlock(nn.Layer):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
super(BasicBlock, self).__init__()
self.with_dcn = dcn is not None
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = BatchNorm2d(planes, momentum=0.1)
self.relu = nn.ReLU()
self.with_modulated_dcn = False
if not self.with_dcn:
self.conv2 = nn.Conv2D(
planes, planes, kernel_size=3, padding=1, bias_attr=False
)
else:
from paddle.vision.ops import DeformConv2D
deformable_groups = dcn.get("deformable_groups", 1)
offset_channels = 18
self.conv2_offset = nn.Conv2D(
planes, deformable_groups * offset_channels, kernel_size=3, padding=1
)
self.conv2 = DeformConv2D(
planes, planes, kernel_size=3, padding=1, bias_attr=False
)
self.bn2 = BatchNorm2d(planes, momentum=0.1)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
# out = self.conv2(out)
if not self.with_dcn:
out = self.conv2(out)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Layer):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
super(Bottleneck, self).__init__()
self.with_dcn = dcn is not None
self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
self.bn1 = BatchNorm2d(planes, momentum=0.1)
self.with_modulated_dcn = False
if not self.with_dcn:
self.conv2 = nn.Conv2D(
planes, planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
)
else:
deformable_groups = dcn.get("deformable_groups", 1)
from paddle.vision.ops import DeformConv2D
offset_channels = 18
self.conv2_offset = nn.Conv2D(
planes,
deformable_groups * offset_channels,
stride=stride,
kernel_size=3,
padding=1,
)
self.conv2 = DeformConv2D(
planes, planes, kernel_size=3, padding=1, stride=stride, bias_attr=False
)
self.bn2 = BatchNorm2d(planes, momentum=0.1)
self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False)
self.bn3 = BatchNorm2d(planes * 4, momentum=0.1)
self.relu = nn.ReLU()
self.downsample = downsample
self.stride = stride
self.dcn = dcn
self.with_dcn = dcn is not None
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
# out = self.conv2(out)
if not self.with_dcn:
out = self.conv2(out)
else:
offset = self.conv2_offset(out)
out = self.conv2(out, offset)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Layer):
def __init__(self, block, layers, in_channels=3, dcn=None):
self.dcn = dcn
self.inplanes = 64
super(ResNet, self).__init__()
self.out_channels = []
self.conv1 = nn.Conv2D(
in_channels, 64, kernel_size=7, stride=2, padding=3, bias_attr=False
)
self.bn1 = BatchNorm2d(64, momentum=0.1)
self.relu = nn.ReLU()
self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dcn=dcn)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dcn=dcn)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dcn=dcn)
if self.dcn is not None:
for m in self.modules():
if isinstance(m, Bottleneck) or isinstance(m, BasicBlock):
if hasattr(m, "conv2_offset"):
constant_init(m.conv2_offset, 0)
def _make_layer(self, block, planes, blocks, stride=1, dcn=None):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2D(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias_attr=False,
),
BatchNorm2d(planes * block.expansion, momentum=0.1),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample, dcn=dcn))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes, dcn=dcn))
self.out_channels.append(planes * block.expansion)
return nn.Sequential(*layers)
def forward(self, x):
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x2 = self.layer1(x)
x3 = self.layer2(x2)
x4 = self.layer3(x3)
x5 = self.layer4(x4)
return x2, x3, x4, x5
def load_torch_params(paddle_model, torch_params):
paddle_params = paddle_model.state_dict()
fc_names = ["classifier"]
    for key, torch_value in torch_params.items():
if "num_batches_tracked" in key:
continue
key = (
key.replace("running_var", "_variance")
.replace("running_mean", "_mean")
.replace("module.", "")
)
torch_value = torch_value.detach().cpu().numpy()
if key in paddle_params:
flag = [i in key for i in fc_names]
if any(flag) and "weight" in key: # ignore bias
new_shape = [1, 0] + list(range(2, torch_value.ndim))
print(
f"name: {key}, ori shape: {torch_value.shape}, new shape: {torch_value.transpose(new_shape).shape}"
)
torch_value = torch_value.transpose(new_shape)
paddle_params[key] = torch_value
else:
print(f"{key} not in paddle")
paddle_model.set_state_dict(paddle_params)
def load_models(model, model_name):
import torch.utils.model_zoo as model_zoo
    torch_params = model_zoo.load_url(model_urls[model_name])
    load_torch_params(model, torch_params)
def resnet18(pretrained=True, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
print("load from imagenet")
load_models(model, "resnet18")
return model
def deformable_resnet18(pretrained=True, **kwargs):
"""Constructs a ResNet-18 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [2, 2, 2, 2], dcn=dict(deformable_groups=1), **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
print("load from imagenet")
        load_models(model, "resnet18")
return model
def resnet34(pretrained=True, **kwargs):
"""Constructs a ResNet-34 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet34")
return model
def resnet50(pretrained=True, **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
load_models(model, "resnet50")
return model
def deformable_resnet50(pretrained=True, **kwargs):
"""Constructs a ResNet-50 model with deformable conv.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], dcn=dict(deformable_groups=1), **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet50")
return model
def resnet101(pretrained=True, **kwargs):
"""Constructs a ResNet-101 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet101")
return model
def resnet152(pretrained=True, **kwargs):
"""Constructs a ResNet-152 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
if pretrained:
assert (
kwargs.get("in_channels", 3) == 3
), "in_channels must be 3 when pretrained is True"
        load_models(model, "resnet152")
return model
if __name__ == "__main__":
x = paddle.zeros([2, 3, 640, 640])
net = resnet50(pretrained=True)
y = net(x)
for u in y:
print(u.shape)
print(net.out_channels)

View File

@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/6 11:19
# @Author : zhoujun
from paddle import nn
class ConvBnRelu(nn.Layer):
def __init__(
self,
in_channels,
out_channels,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=True,
padding_mode="zeros",
inplace=True,
):
super().__init__()
self.conv = nn.Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias,
padding_mode=padding_mode,
)
self.bn = nn.BatchNorm2D(out_channels)
self.relu = nn.ReLU()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x

View File

@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:54
# @Author : zhoujun
import paddle
from paddle import nn, ParamAttr
class DBHead(nn.Layer):
def __init__(self, in_channels, out_channels, k=50):
super().__init__()
self.k = k
self.binarize = nn.Sequential(
nn.Conv2D(
in_channels,
in_channels // 4,
3,
padding=1,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
in_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
nn.Conv2DTranspose(
in_channels // 4,
in_channels // 4,
2,
2,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
in_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
nn.Conv2DTranspose(
in_channels // 4, 1, 2, 2, weight_attr=nn.initializer.KaimingNormal()
),
nn.Sigmoid(),
)
self.thresh = self._init_thresh(in_channels)
def forward(self, x):
shrink_maps = self.binarize(x)
threshold_maps = self.thresh(x)
if self.training:
binary_maps = self.step_function(shrink_maps, threshold_maps)
y = paddle.concat((shrink_maps, threshold_maps, binary_maps), axis=1)
else:
y = paddle.concat((shrink_maps, threshold_maps), axis=1)
return y
def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
in_channels = inner_channels
if serial:
in_channels += 1
self.thresh = nn.Sequential(
nn.Conv2D(
in_channels,
inner_channels // 4,
3,
padding=1,
bias_attr=bias,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
nn.BatchNorm2D(
inner_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
self._init_upsample(
inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias
),
nn.BatchNorm2D(
inner_channels // 4,
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
),
nn.ReLU(),
self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
nn.Sigmoid(),
)
return self.thresh
def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
if smooth:
inter_out_channels = out_channels
if out_channels == 1:
inter_out_channels = in_channels
module_list = [
nn.Upsample(scale_factor=2, mode="nearest"),
nn.Conv2D(
in_channels,
inter_out_channels,
3,
1,
1,
bias_attr=bias,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
),
]
if out_channels == 1:
module_list.append(
nn.Conv2D(
in_channels,
out_channels,
kernel_size=1,
stride=1,
padding=1,
bias_attr=True,
weight_attr=ParamAttr(
initializer=nn.initializer.KaimingNormal()
),
)
)
return nn.Sequential(module_list)
else:
return nn.Conv2DTranspose(
in_channels,
out_channels,
2,
2,
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
)
def step_function(self, x, y):
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
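# Note (illustration only): `step_function` implements the differentiable
# binarization from the DB paper,
#     B = 1 / (1 + exp(-k * (P - T))),
# an approximate step function with amplification factor k (default 50),
# where P is the shrink (probability) map and T the threshold map. At
# inference only the two maps are returned and binarization happens in
# post-processing.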

View File

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:35
# @Author : zhoujun
from .DBHead import DBHead
__all__ = ["build_head"]
support_head = ["DBHead"]
def build_head(head_name, **kwargs):
assert head_name in support_head, f"all support head is {support_head}"
head = eval(head_name)(**kwargs)
return head

View File

@ -0,0 +1,50 @@
import paddle
from models.losses.basic_loss import BalanceCrossEntropyLoss, MaskL1Loss, DiceLoss
class DBLoss(paddle.nn.Layer):
def __init__(self, alpha=1.0, beta=10, ohem_ratio=3, reduction="mean", eps=1e-06):
"""
        Implement DB Loss.
        :param alpha: weight of the shrink-map (balanced bce) loss
        :param beta: weight of the threshold-map (masked L1) loss
        :param ohem_ratio: negative/positive ratio for OHEM
        :param reduction: 'mean' or 'sum', how the loss is reduced over the batch
"""
super().__init__()
        assert reduction in ["mean", "sum"], "reduction must be in ['mean', 'sum']"
self.alpha = alpha
self.beta = beta
self.bce_loss = BalanceCrossEntropyLoss(negative_ratio=ohem_ratio)
self.dice_loss = DiceLoss(eps=eps)
self.l1_loss = MaskL1Loss(eps=eps)
self.ohem_ratio = ohem_ratio
self.reduction = reduction
def forward(self, pred, batch):
shrink_maps = pred[:, 0, :, :]
threshold_maps = pred[:, 1, :, :]
binary_maps = pred[:, 2, :, :]
loss_shrink_maps = self.bce_loss(
shrink_maps, batch["shrink_map"], batch["shrink_mask"]
)
loss_threshold_maps = self.l1_loss(
threshold_maps, batch["threshold_map"], batch["threshold_mask"]
)
metrics = dict(
loss_shrink_maps=loss_shrink_maps, loss_threshold_maps=loss_threshold_maps
)
if pred.shape[1] > 2:
loss_binary_maps = self.dice_loss(
binary_maps, batch["shrink_map"], batch["shrink_mask"]
)
metrics["loss_binary_maps"] = loss_binary_maps
loss_all = (
self.alpha * loss_shrink_maps
+ self.beta * loss_threshold_maps
+ loss_binary_maps
)
metrics["loss"] = loss_all
else:
metrics["loss"] = loss_shrink_maps
return metrics
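# Hedged usage sketch (added for illustration; run from the project root,
# e.g. `python -m models.losses.DB_loss`). During training pred has three
# channels, so the total loss is
#     loss = alpha * loss_shrink + beta * loss_threshold + loss_binary
if __name__ == "__main__":
    criterion = DBLoss(alpha=1.0, beta=10)
    pred = paddle.nn.functional.sigmoid(paddle.randn([2, 3, 160, 160]))
    batch = {
        "shrink_map": paddle.randint(0, 2, [2, 160, 160]).astype("float32"),
        "shrink_mask": paddle.ones([2, 160, 160]),
        "threshold_map": paddle.rand([2, 160, 160]),
        "threshold_mask": paddle.ones([2, 160, 160]),
    }
    metrics = criterion(pred, batch)
    print({k: float(v) for k, v in metrics.items()})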

View File

@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:36
# @Author : zhoujun
import copy
from .DB_loss import DBLoss
__all__ = ["build_loss"]
support_loss = ["DBLoss"]
def build_loss(config):
copy_config = copy.deepcopy(config)
loss_type = copy_config.pop("type")
    assert loss_type in support_loss, f"supported losses are {support_loss}"
criterion = eval(loss_type)(**copy_config)
return criterion

View File

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/4 14:39
# @Author : zhoujun
import paddle
import paddle.nn as nn
class BalanceCrossEntropyLoss(nn.Layer):
"""
Balanced cross entropy loss.
Shape:
- Input: :math:`(N, 1, H, W)`
- GT: :math:`(N, 1, H, W)`, same shape as the input
- Mask: :math:`(N, H, W)`, same spatial shape as the input
- Output: scalar.
"""
def __init__(self, negative_ratio=3.0, eps=1e-6):
super(BalanceCrossEntropyLoss, self).__init__()
self.negative_ratio = negative_ratio
self.eps = eps
def forward(
self,
pred: paddle.Tensor,
gt: paddle.Tensor,
mask: paddle.Tensor,
return_origin=False,
):
"""
Args:
pred: shape :math:`(N, 1, H, W)`, the prediction of network
gt: shape :math:`(N, 1, H, W)`, the target
mask: shape :math:`(N, H, W)`, the mask indicates positive regions
"""
positive = gt * mask
negative = (1 - gt) * mask
positive_count = int(positive.sum())
negative_count = min(
int(negative.sum()), int(positive_count * self.negative_ratio)
)
loss = nn.functional.binary_cross_entropy(pred, gt, reduction="none")
positive_loss = loss * positive
negative_loss = loss * negative
negative_loss, _ = negative_loss.reshape([-1]).topk(negative_count)
balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
positive_count + negative_count + self.eps
)
if return_origin:
return balance_loss, loss
return balance_loss
class DiceLoss(nn.Layer):
"""
Loss function from https://arxiv.org/abs/1707.03237,
    where the IoU computation is applied in a heatmap manner to measure
    the similarity between two heatmaps.
"""
def __init__(self, eps=1e-6):
super(DiceLoss, self).__init__()
self.eps = eps
def forward(self, pred: paddle.Tensor, gt, mask, weights=None):
"""
pred: one or two heatmaps of shape (N, 1, H, W),
            the losses of two heatmaps are added together.
gt: (N, 1, H, W)
mask: (N, H, W)
"""
return self._compute(pred, gt, mask, weights)
def _compute(self, pred, gt, mask, weights):
if len(pred.shape) == 4:
pred = pred[:, 0, :, :]
gt = gt[:, 0, :, :]
assert pred.shape == gt.shape
assert pred.shape == mask.shape
if weights is not None:
assert weights.shape == mask.shape
mask = weights * mask
intersection = (pred * gt * mask).sum()
union = (pred * mask).sum() + (gt * mask).sum() + self.eps
loss = 1 - 2.0 * intersection / union
assert loss <= 1
return loss
class MaskL1Loss(nn.Layer):
def __init__(self, eps=1e-6):
super(MaskL1Loss, self).__init__()
self.eps = eps
def forward(self, pred: paddle.Tensor, gt, mask):
loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps)
return loss
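# Hedged usage sketch (added for illustration, not part of the upstream file):
# OHEM keeps every positive pixel but only the hardest
# `negative_ratio * positive_count` negatives, so easy background pixels do
# not dominate the loss.
if __name__ == "__main__":
    pred = paddle.nn.functional.sigmoid(paddle.randn([2, 32, 32]))
    gt = paddle.randint(0, 2, [2, 32, 32]).astype("float32")
    mask = paddle.ones([2, 32, 32])
    print(float(BalanceCrossEntropyLoss()(pred, gt, mask)))
    print(float(DiceLoss()(pred, gt, mask)))
    print(float(MaskL1Loss()(pred, gt, mask)))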

View File

@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:57
# @Author : zhoujun
from addict import Dict
from paddle import nn
import paddle.nn.functional as F
from models.backbone import build_backbone
from models.neck import build_neck
from models.head import build_head
class Model(nn.Layer):
def __init__(self, model_config: dict):
"""
        Text detection model assembled from a backbone, a neck and a head.
        :param model_config: model configuration dict
"""
super().__init__()
model_config = Dict(model_config)
backbone_type = model_config.backbone.pop("type")
neck_type = model_config.neck.pop("type")
head_type = model_config.head.pop("type")
self.backbone = build_backbone(backbone_type, **model_config.backbone)
self.neck = build_neck(
neck_type, in_channels=self.backbone.out_channels, **model_config.neck
)
self.head = build_head(
head_type, in_channels=self.neck.out_channels, **model_config.head
)
self.name = f"{backbone_type}_{neck_type}_{head_type}"
def forward(self, x):
_, _, H, W = x.shape
backbone_out = self.backbone(x)
neck_out = self.neck(backbone_out)
y = self.head(neck_out)
y = F.interpolate(y, size=(H, W), mode="bilinear", align_corners=True)
return y

View File

@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
# @Time : 2019/9/13 10:29
# @Author : zhoujun
import paddle
import paddle.nn.functional as F
from paddle import nn
from models.basic import ConvBnRelu
class FPN(nn.Layer):
def __init__(self, in_channels, inner_channels=256, **kwargs):
"""
        :param in_channels: output channels of the backbone feature maps
:param kwargs:
"""
super().__init__()
inplace = True
self.conv_out = inner_channels
inner_channels = inner_channels // 4
# reduce layers
self.reduce_conv_c2 = ConvBnRelu(
in_channels[0], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c3 = ConvBnRelu(
in_channels[1], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c4 = ConvBnRelu(
in_channels[2], inner_channels, kernel_size=1, inplace=inplace
)
self.reduce_conv_c5 = ConvBnRelu(
in_channels[3], inner_channels, kernel_size=1, inplace=inplace
)
# Smooth layers
self.smooth_p4 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.smooth_p3 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.smooth_p2 = ConvBnRelu(
inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
)
self.conv = nn.Sequential(
nn.Conv2D(self.conv_out, self.conv_out, kernel_size=3, padding=1, stride=1),
nn.BatchNorm2D(self.conv_out),
nn.ReLU(),
)
self.out_channels = self.conv_out
def forward(self, x):
c2, c3, c4, c5 = x
# Top-down
p5 = self.reduce_conv_c5(c5)
p4 = self._upsample_add(p5, self.reduce_conv_c4(c4))
p4 = self.smooth_p4(p4)
p3 = self._upsample_add(p4, self.reduce_conv_c3(c3))
p3 = self.smooth_p3(p3)
p2 = self._upsample_add(p3, self.reduce_conv_c2(c2))
p2 = self.smooth_p2(p2)
x = self._upsample_cat(p2, p3, p4, p5)
x = self.conv(x)
return x
def _upsample_add(self, x, y):
return F.interpolate(x, size=y.shape[2:]) + y
def _upsample_cat(self, p2, p3, p4, p5):
h, w = p2.shape[2:]
p3 = F.interpolate(p3, size=(h, w))
p4 = F.interpolate(p4, size=(h, w))
p5 = F.interpolate(p5, size=(h, w))
return paddle.concat([p2, p3, p4, p5], axis=1)
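# Hedged shape sketch (added for illustration; run from the project root,
# e.g. `python -m models.neck.FPN`): with a resnet18 backbone in_channels is
# [64, 128, 256, 512] at strides 4/8/16/32; each map is reduced to
# inner_channels // 4, merged top-down, upsampled to the c2 resolution and
# concatenated back to inner_channels.
if __name__ == "__main__":
    fpn = FPN(in_channels=[64, 128, 256, 512], inner_channels=256)
    feats = [
        paddle.rand([1, 64, 160, 160]),
        paddle.rand([1, 128, 80, 80]),
        paddle.rand([1, 256, 40, 40]),
        paddle.rand([1, 512, 20, 20]),
    ]
    print(fpn(feats).shape)  # expected: [1, 256, 160, 160]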

View File

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:34
# @Author : zhoujun
from .FPN import FPN
__all__ = ["build_neck"]
support_neck = ["FPN"]
def build_neck(neck_name, **kwargs):
assert neck_name in support_neck, f"all support neck is {support_neck}"
neck = eval(neck_name)(**kwargs)
return neck

View File

@ -0,0 +1,2 @@
# export NCCL_P2P_DISABLE=1
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml"

View File

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/5 15:17
# @Author : zhoujun
from .seg_detector_representer import SegDetectorRepresenter
def get_post_processing(config):
try:
cls = eval(config["type"])(**config["args"])
return cls
    except Exception:
return None

View File

@ -0,0 +1,191 @@
import cv2
import numpy as np
import pyclipper
import paddle
from shapely.geometry import Polygon
class SegDetectorRepresenter:
def __init__(
self, thresh=0.3, box_thresh=0.7, max_candidates=1000, unclip_ratio=1.5
):
self.min_size = 3
self.thresh = thresh
self.box_thresh = box_thresh
self.max_candidates = max_candidates
self.unclip_ratio = unclip_ratio
def __call__(self, batch, pred, is_output_polygon=False):
"""
        batch: a dict produced by dataloaders, containing:
image: tensor of shape (N, C, H, W).
polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
shape: the original shape of images.
filename: the original filenames of images.
pred:
binary: text region segmentation map, with shape (N, H, W)
            thresh: [if exists] threshold prediction with shape (N, H, W)
thresh_binary: [if exists] binarized with threshold, (N, H, W)
"""
if isinstance(pred, paddle.Tensor):
pred = pred.numpy()
pred = pred[:, 0, :, :]
segmentation = self.binarize(pred)
boxes_batch = []
scores_batch = []
for batch_index in range(pred.shape[0]):
height, width = batch["shape"][batch_index]
if is_output_polygon:
boxes, scores = self.polygons_from_bitmap(
pred[batch_index], segmentation[batch_index], width, height
)
else:
boxes, scores = self.boxes_from_bitmap(
pred[batch_index], segmentation[batch_index], width, height
)
boxes_batch.append(boxes)
scores_batch.append(scores)
return boxes_batch, scores_batch
def binarize(self, pred):
return pred > self.thresh
def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
"""
_bitmap: single map with shape (H, W),
whose values are binarized as {0, 1}
"""
assert len(_bitmap.shape) == 2
bitmap = _bitmap # The first channel
height, width = bitmap.shape
boxes = []
scores = []
contours, _ = cv2.findContours(
(bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
)
for contour in contours[: self.max_candidates]:
epsilon = 0.005 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(contour, epsilon, True)
points = approx.reshape((-1, 2))
if points.shape[0] < 4:
continue
# _, sside = self.get_mini_boxes(contour)
# if sside < self.min_size:
# continue
score = self.box_score_fast(pred, contour.squeeze(1))
if self.box_thresh > score:
continue
if points.shape[0] > 2:
box = self.unclip(points, unclip_ratio=self.unclip_ratio)
if len(box) > 1:
continue
else:
continue
box = box.reshape(-1, 2)
_, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
if sside < self.min_size + 2:
continue
if not isinstance(dest_width, int):
dest_width = dest_width.item()
dest_height = dest_height.item()
box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height
)
boxes.append(box)
scores.append(score)
return boxes, scores
def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
"""
_bitmap: single map with shape (H, W),
whose values are binarized as {0, 1}
"""
assert len(_bitmap.shape) == 2
bitmap = _bitmap # The first channel
height, width = bitmap.shape
contours, _ = cv2.findContours(
(bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
)
num_contours = min(len(contours), self.max_candidates)
boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
scores = np.zeros((num_contours,), dtype=np.float32)
for index in range(num_contours):
contour = contours[index].squeeze(1)
points, sside = self.get_mini_boxes(contour)
if sside < self.min_size:
continue
points = np.array(points)
score = self.box_score_fast(pred, contour)
if self.box_thresh > score:
continue
box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
box, sside = self.get_mini_boxes(box)
if sside < self.min_size + 2:
continue
box = np.array(box)
if not isinstance(dest_width, int):
dest_width = dest_width.item()
dest_height = dest_height.item()
box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
box[:, 1] = np.clip(
np.round(box[:, 1] / height * dest_height), 0, dest_height
)
boxes[index, :, :] = box.astype(np.int16)
scores[index] = score
return boxes, scores
def unclip(self, box, unclip_ratio=1.5):
poly = Polygon(box)
distance = poly.area * unclip_ratio / poly.length
offset = pyclipper.PyclipperOffset()
offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
expanded = np.array(offset.Execute(distance))
return expanded
def get_mini_boxes(self, contour):
bounding_box = cv2.minAreaRect(contour)
points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
index_1, index_2, index_3, index_4 = 0, 1, 2, 3
if points[1][1] > points[0][1]:
index_1 = 0
index_4 = 1
else:
index_1 = 1
index_4 = 0
if points[3][1] > points[2][1]:
index_2 = 2
index_3 = 3
else:
index_2 = 3
index_3 = 2
box = [points[index_1], points[index_2], points[index_3], points[index_4]]
return box, min(bounding_box[1])
def box_score_fast(self, bitmap, _box):
h, w = bitmap.shape[:2]
box = _box.copy()
xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)
mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
box[:, 0] = box[:, 0] - xmin
box[:, 1] = box[:, 1] - ymin
cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
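# Hedged usage sketch (added for illustration, not part of the upstream file):
# binarize the shrink map at `thresh`, extract contours, score each contour
# against the probability map, unclip by D = A * unclip_ratio / L, and scale
# the boxes back to the original image size taken from batch["shape"].
if __name__ == "__main__":
    pred = paddle.rand([1, 2, 160, 160])
    batch = {"shape": [(320, 320)]}
    representer = SegDetectorRepresenter(thresh=0.3, box_thresh=0.5)
    boxes_batch, scores_batch = representer(batch, pred)
    print(len(boxes_batch[0]), len(scores_batch[0]))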

View File

@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python tools/predict.py --model_path model_best.pth --input_folder ./input --output_folder ./output --thre 0.7 --polygon --show --save_result

View File

@ -0,0 +1,14 @@
anyconfig
future
imgaug
matplotlib
numpy
opencv-python
Polygon3
pyclipper
PyYAML
scikit-image
Shapely
tqdm
addict
visualdl

View File

@ -0,0 +1 @@
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml"

View File

@ -0,0 +1,8 @@
Place the images that you want to detect here. It is best to name them like this:
img_10.jpg
img_11.jpg
img_{img_id}.jpg
To predict a single image, change the `img_path` in `/tools/predict.py` to your image number.
The results will be saved in the output_folder (default: test/output) given in predict.sh

View File

@ -0,0 +1,287 @@
#!/bin/bash
source test_tipc/common_func.sh
# run benchmark sh
# Usage:
# bash run_benchmark_train.sh config.txt params
# or
# bash run_benchmark_train.sh config.txt
function func_parser_params(){
strs=$1
IFS="="
array=(${strs})
tmp=${array[1]}
echo ${tmp}
}
function set_dynamic_epoch(){
string=$1
num=$2
_str=${string:1:6}
IFS="C"
arr=(${_str})
M=${arr[0]}
P=${arr[1]}
ep=`expr $num \* $M \* $P`
echo $ep
}
function func_sed_params(){
filename=$1
line=$2
param_value=$3
params=`sed -n "${line}p" $filename`
IFS=":"
array=(${params})
key=${array[0]}
value=${array[1]}
new_params="${key}:${param_value}"
IFS=";"
cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
eval $cmd
}
function set_gpu_id(){
string=$1
_str=${string:1:6}
IFS="C"
arr=(${_str})
M=${arr[0]}
P=${arr[1]}
gn=`expr $P - 1`
gpu_num=`expr $gn / $M`
seq=`seq -s "," 0 $gpu_num`
echo $seq
}
function get_repo_name(){
IFS=";"
cur_dir=$(pwd)
IFS="/"
arr=(${cur_dir})
echo ${arr[-1]}
}
FILENAME=$1
# copy FILENAME as new
new_filename="./test_tipc/benchmark_train.txt"
cmd=`yes|cp $FILENAME $new_filename`
FILENAME=$new_filename
# MODE must be one of ['benchmark_train']
MODE=$2
PARAMS=$3
to_static=""
# parse "to_static" options and modify trainer into "to_static_trainer"
if [[ $PARAMS =~ "dynamicTostatic" ]] ;then
to_static="d2sT_"
sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME
# clear PARAM contents
if [ $PARAMS = "to_static" ] ;then
PARAMS=""
fi
fi
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_fp32_DP_N1C8
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1
IFS=$'\n'
# parser params from train_benchmark.txt
dataline=`cat $FILENAME`
# parser params
IFS=$'\n'
lines=(${dataline})
model_name=$(func_parser_value "${lines[1]}")
python_name=$(func_parser_value "${lines[2]}")
# set env
python=${python_name}
export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
export frame_version=${str_tmp%%.post*}
export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
# get the line number of train_benchmark_params
line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1`
# for train log parser
batch_size=$(func_parser_value "${lines[line_num]}")
line_num=`expr $line_num + 1`
fp_items=$(func_parser_value "${lines[line_num]}")
line_num=`expr $line_num + 1`
epoch=$(func_parser_value "${lines[line_num]}")
line_num=`expr $line_num + 1`
profile_option_key=$(func_parser_key "${lines[line_num]}")
profile_option_params=$(func_parser_value "${lines[line_num]}")
profile_option="${profile_option_key}:${profile_option_params}"
line_num=`expr $line_num + 1`
flags_value=$(func_parser_value "${lines[line_num]}")
# set flags
IFS=";"
flags_list=(${flags_value})
for _flag in ${flags_list[*]}; do
cmd="export ${_flag}"
eval $cmd
done
# set log_name
repo_name=$(get_repo_name )
SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log
mkdir -p "${SAVE_LOG}/benchmark_log/"
status_log="${SAVE_LOG}/benchmark_log/results.log"
# Line numbers in the config file at which train params are replaced below.
line_python=3
line_gpuid=4
line_precision=6
line_epoch=7
line_batchsize=9
line_profile=13
line_eval_py=24
line_export_py=30
func_sed_params "$FILENAME" "${line_eval_py}" "null"
func_sed_params "$FILENAME" "${line_export_py}" "null"
func_sed_params "$FILENAME" "${line_python}" "$python"
# if PARAMS is empty
if [ ! -n "$PARAMS" ] ;then
# no PARAMS given: use the default batch size / precision / device lists
IFS="|"
batch_size_list=(${batch_size})
fp_items_list=(${fp_items})
device_num_list=(N1C4)
run_mode="DP"
elif [[ ${PARAMS} = "dynamicTostatic" ]];then
IFS="|"
model_type=$PARAMS
batch_size_list=(${batch_size})
fp_items_list=(${fp_items})
device_num_list=(N1C4)
run_mode="DP"
else
# parse params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
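# e.g. PARAMS="dynamic_bs8_fp32_DP_N1C8" -> model_type=dynamic, batch_size=8, precision=fp32, run_mode=DP, device_num=N1C8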
IFS="_"
params_list=(${PARAMS})
model_type=${params_list[0]}
batch_size=${params_list[1]}
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
precision=${params_list[2]}
run_mode=${params_list[3]}
device_num=${params_list[4]}
IFS=";"
if [ ${precision} = "fp16" ];then
precision="amp"
fi
epoch=$(set_dynamic_epoch $device_num $epoch)
fp_items_list=($precision)
batch_size_list=($batch_size)
device_num_list=($device_num)
fi
IFS="|"
for batch_size in ${batch_size_list[*]}; do
for train_precision in ${fp_items_list[*]}; do
for device_num in ${device_num_list[*]}; do
# sed batchsize and precision
if [ ${train_precision} = "amp" ];then
precision="fp16"
else
precision="fp32"
fi
func_sed_params "$FILENAME" "${line_precision}" "$train_precision"
func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size"
func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch"
gpu_id=$(set_gpu_id $device_num)
if [ ${#gpu_id} -le 1 ];then
log_path="$SAVE_LOG/profiling_log"
mkdir -p $log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling"
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
# set profile_option params
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
# run test_train_inference_python.sh
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
echo $cmd
eval $cmd
eval "cat ${log_path}/${log_name}"
# without profile
log_path="$SAVE_LOG/train_log"
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
echo $cmd
job_bt=`date '+%Y%m%d%H%M%S'`
eval $cmd
job_et=`date '+%Y%m%d%H%M%S'`
export model_run_time=$((${job_et}-${job_bt}))
eval "cat ${log_path}/${log_name}"
# parser log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
--device_num ${device_num} \
--speed_unit samples/s \
--convergence_key loss: "
echo $cmd
eval $cmd
last_status=${PIPESTATUS[0]}
status_check $last_status "${cmd}" "${status_log}"
else
IFS=";"
unset CUDA_VISIBLE_DEVICES
log_path="$SAVE_LOG/train_log"
speed_log_path="$SAVE_LOG/index"
mkdir -p $log_path
mkdir -p $speed_log_path
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
echo $cmd
job_bt=`date '+%Y%m%d%H%M%S'`
eval $cmd
job_et=`date '+%Y%m%d%H%M%S'`
export model_run_time=$((${job_et}-${job_bt}))
eval "cat ${log_path}/${log_name}"
# parser log
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
--speed_log_file '${speed_log_path}/${speed_log_name}' \
--model_name ${_model_name} \
--base_batch_size ${batch_size} \
--run_mode ${run_mode} \
--fp_item ${precision} \
--keyword ips: \
--skip_steps 2 \
--device_num ${device_num} \
--speed_unit images/s \
--convergence_key loss: "
echo $cmd
eval $cmd
last_status=${PIPESTATUS[0]}
status_check $last_status "${cmd}" "${status_log}"
fi
done
done
done

View File

@ -0,0 +1,67 @@
#!/bin/bash
function func_parser_key(){
strs=$1
IFS=":"
array=(${strs})
tmp=${array[0]}
echo ${tmp}
}
function func_parser_value(){
strs=$1
IFS=":"
array=(${strs})
tmp=${array[1]}
echo ${tmp}
}
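# e.g. for the line "gpu_list:0|0,1", func_parser_key echoes "gpu_list" and func_parser_value echoes "0|0,1"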
function func_set_params(){
key=$1
value=$2
if [ ${key}x = "null"x ];then
echo " "
elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then
echo " "
else
echo "${key}=${value}"
fi
}
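# e.g. func_set_params "--batch_size" "1" echoes "--batch_size=1"; a "null" key or empty value echoes " "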
function func_parser_params(){
strs=$1
MODE=$2
IFS=":"
array=(${strs})
key=${array[0]}
tmp=${array[1]}
IFS="|"
res=""
for _params in ${tmp[*]}; do
IFS="="
array=(${_params})
mode=${array[0]}
value=${array[1]}
if [[ ${mode} = ${MODE} ]]; then
IFS="|"
#echo $(func_set_params "${mode}" "${value}")
echo $value
break
fi
IFS="|"
done
echo ${res}
}
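# e.g. func_parser_params "trainer.epochs:lite_train_lite_infer=1|whole_train_whole_infer=300" "lite_train_lite_infer" echoes "1"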
function status_check(){
last_status=$1 # the exit code
run_command=$2
run_log=$3
model_name=$4
log_path=$5
if [ $last_status -eq 0 ]; then
echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
else
echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
fi
}

View File

@ -0,0 +1,61 @@
===========================train_params===========================
model_name:det_res50_db
python:python
gpu_list:0|0,1
trainer.use_gpu:True|True
amp:null
trainer.epochs:lite_train_lite_infer=1|whole_train_whole_infer=300
trainer.output_dir:./output/
dataset.train.loader.batch_size:lite_train_lite_infer=8|whole_train_lite_infer=8
trainer.finetune_checkpoint:null
train_model_name:checkpoint/model_latest.pth
train_infer_img_dir:imgs/paper/db.jpg
null:null
##
trainer:norm_train
norm_train:tools/train.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.log_iter=1 trainer.enable_eval=False dataset.train.loader.shuffle=false arch.backbone.pretrained=False
quant_export:null
fpgm_export:null
distill_train:null
null:null
null:null
##
===========================eval_params===========================
eval:null
null:null
##
===========================infer_params===========================
trainer.output_dir:./output/
trainer.resume_checkpoint:
norm_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o
quant_export:null
fpgm_export:null
distill_export:null
export1:null
export2:null
##
train_model:./inference/det_r50_vd_db_v2.0_train/best_accuracy
infer_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o
infer_quant:False
inference:tools/infer.py
--use_gpu:True|False
--enable_mkldnn:False
--cpu_threads:6
--batch_size:1
--use_tensorrt:False
--precision:fp32
--model_dir:
--img_path:imgs/paper/db.jpg
--save_log_path:null
--benchmark:True
null:null
===========================infer_benchmark_params==========================
random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}]
===========================train_benchmark_params==========================
batch_size:8
fp_items:fp32|fp16
epoch:2
--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
===========================to_static_train_benchmark_params===========================
to_static_train:trainer.to_static=true

View File

@ -0,0 +1,54 @@
#!/bin/bash
source test_tipc/common_func.sh
FILENAME=$1
# MODE must be one of ['lite_train_lite_infer', 'lite_train_whole_infer', 'whole_train_whole_infer',
# 'whole_infer', 'klquant_whole_infer',
# 'cpp_infer', 'serving_infer']
MODE=$2
dataline=$(cat ${FILENAME})
# parse params
IFS=$'\n'
lines=(${dataline})
# The training params
model_name=$(func_parser_value "${lines[1]}")
trainer_list=$(func_parser_value "${lines[14]}")
if [ ${MODE} = "lite_train_lite_infer" ];then
python_name_list=$(func_parser_value "${lines[2]}")
array=(${python_name_list})
python_name=${array[0]}
${python_name} -m pip install -r requirement.txt
if [[ ${model_name} =~ "det_res50_db" ]];then
wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth
# Download and extract the dataset
rm -rf datasets
wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar
tar xf datasets.tar
fi
elif [ ${MODE} = "benchmark_train" ];then
python_name_list=$(func_parser_value "${lines[2]}")
array=(${python_name_list})
python_name=${array[0]}
${python_name} -m pip install -r requirement.txt
if [[ ${model_name} =~ "det_res50_db" ]];then
wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth
# Download and extract the dataset
rm -rf datasets
wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar
tar xf datasets.tar
# expand gt.txt 2 times
# cd ./train_data/icdar2015/text_localization
# for i in `seq 2`;do cp train_icdar2015_label.txt dup$i.txt;done
# cat dup* > train_icdar2015_label.txt && rm -rf dup*
# cd ../../../
fi
fi

View File

@ -0,0 +1,343 @@
#!/bin/bash
source test_tipc/common_func.sh
FILENAME=$1
# MODE must be one of ['lite_train_lite_infer', 'lite_train_whole_infer', 'whole_train_whole_infer', 'whole_infer']
MODE=$2
dataline=$(awk 'NR>=1{print}' $FILENAME)
# parse params
IFS=$'\n'
lines=(${dataline})
# The training params
model_name=$(func_parser_value "${lines[1]}")
python=$(func_parser_value "${lines[2]}")
gpu_list=$(func_parser_value "${lines[3]}")
train_use_gpu_key=$(func_parser_key "${lines[4]}")
train_use_gpu_value=$(func_parser_value "${lines[4]}")
autocast_list=$(func_parser_value "${lines[5]}")
autocast_key=$(func_parser_key "${lines[5]}")
epoch_key=$(func_parser_key "${lines[6]}")
epoch_num=$(func_parser_params "${lines[6]}" "${MODE}")
save_model_key=$(func_parser_key "${lines[7]}")
train_batch_key=$(func_parser_key "${lines[8]}")
train_batch_value=$(func_parser_params "${lines[8]}" "${MODE}")
pretrain_model_key=$(func_parser_key "${lines[9]}")
pretrain_model_value=$(func_parser_value "${lines[9]}")
train_model_name=$(func_parser_value "${lines[10]}")
train_infer_img_dir=$(func_parser_value "${lines[11]}")
train_param_key1=$(func_parser_key "${lines[12]}")
train_param_value1=$(func_parser_value "${lines[12]}")
trainer_list=$(func_parser_value "${lines[14]}")
trainer_norm=$(func_parser_key "${lines[15]}")
norm_trainer=$(func_parser_value "${lines[15]}")
pact_key=$(func_parser_key "${lines[16]}")
pact_trainer=$(func_parser_value "${lines[16]}")
fpgm_key=$(func_parser_key "${lines[17]}")
fpgm_trainer=$(func_parser_value "${lines[17]}")
distill_key=$(func_parser_key "${lines[18]}")
distill_trainer=$(func_parser_value "${lines[18]}")
trainer_key1=$(func_parser_key "${lines[19]}")
trainer_value1=$(func_parser_value "${lines[19]}")
trainer_key2=$(func_parser_key "${lines[20]}")
trainer_value2=$(func_parser_value "${lines[20]}")
eval_py=$(func_parser_value "${lines[23]}")
eval_key1=$(func_parser_key "${lines[24]}")
eval_value1=$(func_parser_value "${lines[24]}")
save_infer_key=$(func_parser_key "${lines[27]}")
export_weight=$(func_parser_key "${lines[28]}")
norm_export=$(func_parser_value "${lines[29]}")
pact_export=$(func_parser_value "${lines[30]}")
fpgm_export=$(func_parser_value "${lines[31]}")
distill_export=$(func_parser_value "${lines[32]}")
export_key1=$(func_parser_key "${lines[33]}")
export_value1=$(func_parser_value "${lines[33]}")
export_key2=$(func_parser_key "${lines[34]}")
export_value2=$(func_parser_value "${lines[34]}")
inference_dir=$(func_parser_value "${lines[35]}")
# parse inference model params
infer_model_dir_list=$(func_parser_value "${lines[36]}")
infer_export_list=$(func_parser_value "${lines[37]}")
infer_is_quant=$(func_parser_value "${lines[38]}")
# parse inference params
inference_py=$(func_parser_value "${lines[39]}")
use_gpu_key=$(func_parser_key "${lines[40]}")
use_gpu_list=$(func_parser_value "${lines[40]}")
use_mkldnn_key=$(func_parser_key "${lines[41]}")
use_mkldnn_list=$(func_parser_value "${lines[41]}")
cpu_threads_key=$(func_parser_key "${lines[42]}")
cpu_threads_list=$(func_parser_value "${lines[42]}")
batch_size_key=$(func_parser_key "${lines[43]}")
batch_size_list=$(func_parser_value "${lines[43]}")
use_trt_key=$(func_parser_key "${lines[44]}")
use_trt_list=$(func_parser_value "${lines[44]}")
precision_key=$(func_parser_key "${lines[45]}")
precision_list=$(func_parser_value "${lines[45]}")
infer_model_key=$(func_parser_key "${lines[46]}")
image_dir_key=$(func_parser_key "${lines[47]}")
infer_img_dir=$(func_parser_value "${lines[47]}")
save_log_key=$(func_parser_key "${lines[48]}")
benchmark_key=$(func_parser_key "${lines[49]}")
benchmark_value=$(func_parser_value "${lines[49]}")
infer_key1=$(func_parser_key "${lines[50]}")
infer_value1=$(func_parser_value "${lines[50]}")
LOG_PATH="./test_tipc/output/${model_name}/${MODE}"
mkdir -p ${LOG_PATH}
status_log="${LOG_PATH}/results_python.log"
line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1`
to_static_key=$(func_parser_key "${lines[line_num]}")
to_static_trainer=$(func_parser_value "${lines[line_num]}")
function func_inference(){
IFS='|'
_python=$1
_script=$2
_model_dir=$3
_log_path=$4
_img_dir=$5
_flag_quant=$6
_gpu=$7
# inference
for use_gpu in ${use_gpu_list[*]}; do
if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
for use_mkldnn in ${use_mkldnn_list[*]}; do
# if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
# continue
# fi
for threads in ${cpu_threads_list[*]}; do
for batch_size in ${batch_size_list[*]}; do
for precision in ${precision_list[*]}; do
if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then
continue
fi # skip when enable fp16 but disable mkldnn
if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then
continue
fi # skip when quant model inference but precision is not int8
set_precision=$(func_set_params "${precision_key}" "${precision}")
_save_log_path="${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
set_mkldnn=$(func_set_params "${use_mkldnn_key}" "${use_mkldnn}")
set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}")
set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}")
set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_params0} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 "
eval $command
last_status=${PIPESTATUS[0]}
eval "cat ${_save_log_path}"
status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
done
done
done
done
elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
for use_trt in ${use_trt_list[*]}; do
for precision in ${precision_list[*]}; do
if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then
continue
fi
if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
continue
fi
if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
continue
fi
for batch_size in ${batch_size_list[*]}; do
_save_log_path="${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}")
set_precision=$(func_set_params "${precision_key}" "${precision}")
set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}")
set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params0} > ${_save_log_path} 2>&1 "
eval $command
last_status=${PIPESTATUS[0]}
eval "cat ${_save_log_path}"
status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
done
done
done
else
echo "Does not support hardware other than CPU and GPU Currently!"
fi
done
}
if [ ${MODE} = "whole_infer" ]; then
GPUID=$3
if [ ${#GPUID} -le 0 ];then
env=" "
else
env="export CUDA_VISIBLE_DEVICES=${GPUID}"
fi
# set CUDA_VISIBLE_DEVICES
eval $env
export Count=0
gpu=0
IFS="|"
infer_run_exports=(${infer_export_list})
infer_quant_flag=(${infer_is_quant})
for infer_model in ${infer_model_dir_list[*]}; do
# run export
if [ ${infer_run_exports[Count]} != "null" ];then
save_infer_dir="${infer_model}"
set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
export_log_path="${LOG_PATH}_export_${Count}.log"
export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 "
echo ${infer_run_exports[Count]}
echo $export_cmd
eval $export_cmd
status_export=$?
status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
else
save_infer_dir=${infer_model}
fi
#run inference
is_quant=${infer_quant_flag[Count]}
func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} "${gpu}"
Count=$(($Count + 1))
done
else
IFS="|"
export Count=0
USE_GPU_KEY=(${train_use_gpu_value})
for gpu in ${gpu_list[*]}; do
train_use_gpu=${USE_GPU_KEY[Count]}
Count=$(($Count + 1))
ips=""
if [ ${gpu} = "-1" ];then
env=""
elif [ ${#gpu} -le 1 ];then
env="export CUDA_VISIBLE_DEVICES=${gpu}"
elif [ ${#gpu} -le 15 ];then
IFS=","
array=(${gpu})
env="export CUDA_VISIBLE_DEVICES=${array[0]}"
IFS="|"
else
IFS=";"
array=(${gpu})
ips=${array[0]}
gpu=${array[1]}
IFS="|"
env=" "
fi
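# gpu_list formats handled above: "-1" -> CPU; "0" -> single GPU; "0,1" -> single-machine multi-GPU; "ip1,ip2;0,1" -> multi-machine (ips;gpus)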
for autocast in ${autocast_list[*]}; do
if [ ${autocast} = "amp" ]; then
set_amp_config="amp.scale_loss=1024.0 amp.use_dynamic_loss_scaling=True amp.amp_level=O2"
else
set_amp_config="amp=None"
fi
for trainer in ${trainer_list[*]}; do
flag_quant=False
if [ ${trainer} = ${pact_key} ]; then
run_train=${pact_trainer}
run_export=${pact_export}
flag_quant=True
elif [ ${trainer} = "${fpgm_key}" ]; then
run_train=${fpgm_trainer}
run_export=${fpgm_export}
elif [ ${trainer} = "${distill_key}" ]; then
run_train=${distill_trainer}
run_export=${distill_export}
elif [ ${trainer} = "${to_static_key}" ]; then
run_train="${norm_trainer} ${to_static_trainer}"
run_export=${norm_export}
elif [[ ${trainer} = ${trainer_key2} ]]; then
run_train=${trainer_value2}
run_export=${export_value2}
else
run_train=${norm_trainer}
run_export=${norm_export}
fi
if [ ${run_train} = "null" ]; then
continue
fi
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}")
# if length of ips >= 15, then it is seen as multi-machine
# 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0
if [ ${#ips} -le 15 ];then
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
nodes=1
else
IFS=","
ips_array=(${ips})
IFS="|"
nodes=${#ips_array[@]}
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
fi
set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
elif [ ${#ips} -le 15 ];then # train with multi-gpu
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
else # train with multi-machine
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
fi
# run train
eval $cmd
eval "cat ${save_log}/train.log >> ${save_log}.log"
status_check $? "${cmd}" "${status_log}" "${model_name}" "${save_log}.log"
set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}")
# run eval
if [ ${eval_py} != "null" ]; then
eval ${env}
set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log"
eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 "
eval $eval_cmd
status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}"
fi
# run export model
if [ ${run_export} != "null" ]; then
# run export model
save_infer_path="${save_log}"
export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log"
set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}")
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}")
export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 "
eval $export_cmd
status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
#run inference
eval $env
save_infer_path="${save_log}"
if [[ ${inference_dir} != "null" ]] && [[ ${inference_dir} != '##' ]]; then
infer_model_dir="${save_infer_path}/${inference_dir}"
else
infer_model_dir=${save_infer_path}
fi
func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" "${gpu}"
eval "unset CUDA_VISIBLE_DEVICES"
fi
done # done with: for trainer in ${trainer_list[*]}; do
done # done with: for autocast in ${autocast_list[*]}; do
done # done with: for gpu in ${gpu_list[*]}; do
fi # end of: if [ ${MODE} = "whole_infer" ]

View File

@ -0,0 +1,3 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/8 13:14
# @Author : zhoujun

View File

@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
# @Time : 2018/6/11 15:54
# @Author : zhoujun
import os
import sys
import pathlib
__dir__ = pathlib.Path(os.path.abspath(__file__))
sys.path.append(str(__dir__))
sys.path.append(str(__dir__.parent.parent))
import argparse
import time
import paddle
from tqdm.auto import tqdm
class EVAL:
def __init__(self, model_path, gpu_id=0):
from models import build_model
from data_loader import get_dataloader
from post_processing import get_post_processing
from utils import get_metric
self.gpu_id = gpu_id
if (
self.gpu_id is not None
and isinstance(self.gpu_id, int)
and paddle.device.is_compiled_with_cuda()
):
paddle.device.set_device("gpu:{}".format(self.gpu_id))
else:
paddle.device.set_device("cpu")
checkpoint = paddle.load(model_path)
config = checkpoint["config"]
config["arch"]["backbone"]["pretrained"] = False
self.validate_loader = get_dataloader(
config["dataset"]["validate"], config["distributed"]
)
self.model = build_model(config["arch"])
self.model.set_state_dict(checkpoint["state_dict"])
self.post_process = get_post_processing(config["post_processing"])
self.metric_cls = get_metric(config["metric"])
def eval(self):
self.model.eval()
raw_metrics = []
total_frame = 0.0
total_time = 0.0
for i, batch in tqdm(
enumerate(self.validate_loader),
total=len(self.validate_loader),
desc="test model",
):
with paddle.no_grad():
start = time.time()
preds = self.model(batch["img"])
boxes, scores = self.post_process(
batch, preds, is_output_polygon=self.metric_cls.is_output_polygon
)
total_frame += batch["img"].shape[0]
total_time += time.time() - start
raw_metric = self.metric_cls.validate_measure(batch, (boxes, scores))
raw_metrics.append(raw_metric)
metrics = self.metric_cls.gather_measure(raw_metrics)
print("FPS:{}".format(total_frame / total_time))
return {
"recall": metrics["recall"].avg,
"precision": metrics["precision"].avg,
"fmeasure": metrics["fmeasure"].avg,
}
def init_args():
parser = argparse.ArgumentParser(description="DBNet.paddle")
parser.add_argument(
"--model_path",
required=False,
default="output/DBNet_resnet18_FPN_DBHead/checkpoint/1.pth",
type=str,
)
args = parser.parse_args()
return args
if __name__ == "__main__":
args = init_args()
eval = EVAL(args.model_path)
result = eval.eval()
print(result)

View File

@ -0,0 +1,57 @@
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "..")))
import argparse
import paddle
from paddle.jit import to_static
from models import build_model
from utils import Config, ArgsParser
def init_args():
parser = ArgsParser()
args = parser.parse_args()
return args
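# Example usage, mirroring the TIPC norm_export entry (the checkpoint path is a placeholder):
#   python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=<path/to/checkpoint.pth>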
def load_checkpoint(model, checkpoint_path):
"""
load checkpoints
:param checkpoint_path: Checkpoint path to be loaded
"""
checkpoint = paddle.load(checkpoint_path)
model.set_state_dict(checkpoint["state_dict"])
print("load checkpoint from {}".format(checkpoint_path))
def main(config):
model = build_model(config["arch"])
load_checkpoint(model, config["trainer"]["resume_checkpoint"])
model.eval()
save_path = config["trainer"]["output_dir"]
save_path = os.path.join(save_path, "inference")
infer_shape = [3, -1, -1]
model = to_static(
model,
input_spec=[
paddle.static.InputSpec(shape=[None] + infer_shape, dtype="float32")
],
)
paddle.jit.save(model, save_path)
print("inference model is saved to {}".format(save_path))
if __name__ == "__main__":
args = init_args()
assert os.path.exists(args.config_file)
config = Config(args.config_file)
config.merge_dict(args.opt)
main(config.cfg)

View File

@ -0,0 +1,315 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import pathlib
__dir__ = pathlib.Path(os.path.abspath(__file__))
sys.path.append(str(__dir__))
sys.path.append(str(__dir__.parent.parent))
import cv2
import paddle
from paddle import inference
import numpy as np
from PIL import Image
from paddle.vision import transforms
from tools.predict import resize_image
from post_processing import get_post_processing
from utils.util import draw_bbox, save_result
class InferenceEngine(object):
"""InferenceEngine
Inference engine class which contains preprocess, run, postprocess
"""
def __init__(self, args):
"""
Args:
args: Parameters generated using argparser.
Returns: None
"""
super().__init__()
self.args = args
# init inference engine
(
self.predictor,
self.config,
self.input_tensor,
self.output_tensor,
) = self.load_predictor(
os.path.join(args.model_dir, "inference.pdmodel"),
os.path.join(args.model_dir, "inference.pdiparams"),
)
# build transforms
self.transforms = transforms.Compose(
[
transforms.ToTensor(),
transforms.Normalize(
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
),
]
)
# warmup
if self.args.warmup > 0:
for idx in range(args.warmup):
print(idx)
x = np.random.rand(
1, 3, self.args.crop_size, self.args.crop_size
).astype("float32")
self.input_tensor.copy_from_cpu(x)
self.predictor.run()
self.output_tensor.copy_to_cpu()
self.post_process = get_post_processing(
{
"type": "SegDetectorRepresenter",
"args": {
"thresh": 0.3,
"box_thresh": 0.7,
"max_candidates": 1000,
"unclip_ratio": 1.5,
},
}
)
def load_predictor(self, model_file_path, params_file_path):
"""load_predictor
initialize the inference engine
Args:
model_file_path: inference model path (*.pdmodel)
params_file_path: inference parameter path (*.pdiparams)
Return:
predictor: Predictor created using Paddle Inference.
config: Configuration of the predictor.
input_tensor: Input tensor of the predictor.
output_tensor: Output tensor of the predictor.
"""
args = self.args
config = inference.Config(model_file_path, params_file_path)
if args.use_gpu:
config.enable_use_gpu(1000, 0)
if args.use_tensorrt:
# map the CLI precision string onto a Paddle Inference PrecisionType
if args.precision == "int8":
precision_mode = inference.PrecisionType.Int8
elif args.precision == "fp16":
precision_mode = inference.PrecisionType.Half
else:
precision_mode = inference.PrecisionType.Float32
config.enable_tensorrt_engine(
workspace_size=1 << 30,
precision_mode=precision_mode,
max_batch_size=args.max_batch_size,
min_subgraph_size=args.min_subgraph_size, # skip the minimum trt subgraph
use_calib_mode=False,
)
# collect shape
trt_shape_f = os.path.join(args.model_dir, "_trt_dynamic_shape.txt")
if not os.path.exists(trt_shape_f):
config.collect_shape_range_info(trt_shape_f)
print(f"collect dynamic shape info into : {trt_shape_f}")
try:
config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, True)
except Exception as E:
print(E)
print("Please keep your paddlepaddle-gpu >= 2.3.0!")
else:
config.disable_gpu()
# The thread num should not be greater than the number of cores in the CPU.
if args.enable_mkldnn:
# cache 10 different shapes for mkldnn to avoid memory leak
config.set_mkldnn_cache_capacity(10)
config.enable_mkldnn()
if args.precision == "fp16":
config.enable_mkldnn_bfloat16()
if hasattr(args, "cpu_threads"):
config.set_cpu_math_library_num_threads(args.cpu_threads)
else:
# default cpu threads as 10
config.set_cpu_math_library_num_threads(10)
# enable memory optim
config.enable_memory_optim()
config.disable_glog_info()
config.switch_use_feed_fetch_ops(False)
config.switch_ir_optim(True)
# create predictor
predictor = inference.create_predictor(config)
# get input and output tensor property
input_names = predictor.get_input_names()
input_tensor = predictor.get_input_handle(input_names[0])
output_names = predictor.get_output_names()
output_tensor = predictor.get_output_handle(output_names[0])
return predictor, config, input_tensor, output_tensor
def preprocess(self, img_path, short_size):
"""preprocess
Preprocess to the input.
Args:
img_path: Image path.
short_size: target length of the shorter side after resizing.
Returns: Input data after preprocess.
"""
img = cv2.imread(img_path, 1)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
h, w = img.shape[:2]
img = resize_image(img, short_size)
img = self.transforms(img)
img = np.expand_dims(img, axis=0)
shape_info = {"shape": [(h, w)]}
return img, shape_info
def postprocess(self, x, shape_info, is_output_polygon):
"""postprocess
Postprocess to the inference engine output.
Args:
x: Inference engine output.
Returns: the box list and score list after DB post-processing.
"""
box_list, score_list = self.post_process(
shape_info, x, is_output_polygon=is_output_polygon
)
box_list, score_list = box_list[0], score_list[0]
if len(box_list) > 0:
if is_output_polygon:
idx = [x.sum() > 0 for x in box_list]
box_list = [box_list[i] for i, v in enumerate(idx) if v]
score_list = [score_list[i] for i, v in enumerate(idx) if v]
else:
idx = (
box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0
) # drop boxes whose coordinates are all zero
box_list, score_list = box_list[idx], score_list[idx]
else:
box_list, score_list = [], []
return box_list, score_list
def run(self, x):
"""run
Inference process using inference engine.
Args:
x: Input data after preprocess.
Returns: Inference engine output
"""
self.input_tensor.copy_from_cpu(x)
self.predictor.run()
output = self.output_tensor.copy_to_cpu()
return output
def get_args(add_help=True):
"""
parse args
"""
import argparse
def str2bool(v):
return v.lower() in ("true", "t", "1")
parser = argparse.ArgumentParser(
description="DBNet.paddle inference", add_help=add_help
)
parser.add_argument("--model_dir", default=None, help="inference model dir")
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--short_size", default=1024, type=int, help="short size")
parser.add_argument("--img_path", default="./images/demo.jpg")
parser.add_argument("--benchmark", default=False, type=str2bool, help="benchmark")
parser.add_argument("--warmup", default=0, type=int, help="warmup iter")
parser.add_argument("--polygon", action="store_true", help="output polygon or box")
parser.add_argument("--use_gpu", type=str2bool, default=True)
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
parser.add_argument("--precision", type=str, default="fp32")
parser.add_argument("--gpu_mem", type=int, default=500)
parser.add_argument("--gpu_id", type=int, default=0)
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
parser.add_argument("--cpu_threads", type=int, default=10)
args = parser.parse_args()
return args
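# Example usage, matching the TIPC inference entry (the model dir is a placeholder):
#   python tools/infer.py --model_dir <exported_model_dir> --img_path imgs/paper/db.jpg --use_gpu False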
def main(args):
"""
Main inference function.
Args:
args: Parameters generated using argparser.
Returns:
None. The visualized detection result and a text file of boxes/scores are saved under ./output.
"""
inference_engine = InferenceEngine(args)
# init benchmark
if args.benchmark:
import auto_log
autolog = auto_log.AutoLogger(
model_name="db",
batch_size=args.batch_size,
inference_config=inference_engine.config,
gpu_ids="auto" if args.use_gpu else None,
)
# enable benchmark
if args.benchmark:
autolog.times.start()
# preprocess
img, shape_info = inference_engine.preprocess(args.img_path, args.short_size)
if args.benchmark:
autolog.times.stamp()
output = inference_engine.run(img)
if args.benchmark:
autolog.times.stamp()
# postprocess
box_list, score_list = inference_engine.postprocess(
output, shape_info, args.polygon
)
if args.benchmark:
autolog.times.stamp()
autolog.times.end(stamp=True)
autolog.report()
img = draw_bbox(cv2.imread(args.img_path)[:, :, ::-1], box_list)
# save the result to the output path
os.makedirs("output", exist_ok=True)
img_path = pathlib.Path(args.img_path)
output_path = os.path.join("output", img_path.stem + "_infer_result.jpg")
cv2.imwrite(output_path, img[:, :, ::-1])
save_result(
output_path.replace("_infer_result.jpg", ".txt"),
box_list,
score_list,
args.polygon,
)
if __name__ == "__main__":
args = get_args()
main(args)

View File

@ -0,0 +1,175 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/24 12:06
# @Author : zhoujun
import os
import sys
import pathlib
__dir__ = pathlib.Path(os.path.abspath(__file__))
sys.path.append(str(__dir__))
sys.path.append(str(__dir__.parent.parent))
import time
import cv2
import paddle
from data_loader import get_transforms
from models import build_model
from post_processing import get_post_processing
def resize_image(img, short_size):
height, width, _ = img.shape
if height < width:
new_height = short_size
new_width = new_height / height * width
else:
new_width = short_size
new_height = new_width / width * height
new_height = int(round(new_height / 32) * 32)
new_width = int(round(new_width / 32) * 32)
resized_img = cv2.resize(img, (new_width, new_height))
return resized_img
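# e.g. a 720x1280 (h x w) image with short_size=1024 becomes 1024x1824: the short side is scaled to 1024 and both sides are rounded to multiples of 32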
class PaddleModel:
def __init__(self, model_path, post_p_thre=0.7, gpu_id=None):
"""
Initialize the model.
:param model_path: path to the model (either the parameters alone, or parameters saved together with the computation graph)
:param gpu_id: index of the GPU to run on
"""
self.gpu_id = gpu_id
if (
self.gpu_id is not None
and isinstance(self.gpu_id, int)
and paddle.device.is_compiled_with_cuda()
):
paddle.device.set_device("gpu:{}".format(self.gpu_id))
else:
paddle.device.set_device("cpu")
checkpoint = paddle.load(model_path)
config = checkpoint["config"]
config["arch"]["backbone"]["pretrained"] = False
self.model = build_model(config["arch"])
self.post_process = get_post_processing(config["post_processing"])
self.post_process.box_thresh = post_p_thre
self.img_mode = config["dataset"]["train"]["dataset"]["args"]["img_mode"]
self.model.set_state_dict(checkpoint["state_dict"])
self.model.eval()
self.transform = []
for t in config["dataset"]["train"]["dataset"]["args"]["transforms"]:
if t["type"] in ["ToTensor", "Normalize"]:
self.transform.append(t)
self.transform = get_transforms(self.transform)
def predict(self, img_path: str, is_output_polygon=False, short_size: int = 1024):
"""
Run prediction on the image at the given path; note that reading images with OpenCV is relatively slow.
:param img_path: path to the image
:param is_output_polygon: whether to output polygons instead of quadrilateral boxes
:param short_size: target length of the shorter side after resizing
:return:
"""
assert os.path.exists(img_path), "file does not exist"
img = cv2.imread(img_path, 1 if self.img_mode != "GRAY" else 0)
if self.img_mode == "RGB":
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
h, w = img.shape[:2]
img = resize_image(img, short_size)
# convert the image from (w, h) to (1, img_channel, h, w)
tensor = self.transform(img)
tensor = tensor.unsqueeze_(0)
batch = {"shape": [(h, w)]}
with paddle.no_grad():
start = time.time()
preds = self.model(tensor)
box_list, score_list = self.post_process(
batch, preds, is_output_polygon=is_output_polygon
)
box_list, score_list = box_list[0], score_list[0]
if len(box_list) > 0:
if is_output_polygon:
idx = [x.sum() > 0 for x in box_list]
box_list = [box_list[i] for i, v in enumerate(idx) if v]
score_list = [score_list[i] for i, v in enumerate(idx) if v]
else:
idx = (
box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0
) # drop boxes whose coordinates are all zero
box_list, score_list = box_list[idx], score_list[idx]
else:
box_list, score_list = [], []
t = time.time() - start
return preds[0, 0, :, :].detach().cpu().numpy(), box_list, score_list, t
def save_deploy(net, input, save_path):
input_spec = [paddle.static.InputSpec(shape=[None, 3, None, None], dtype="float32")]
net = paddle.jit.to_static(net, input_spec=input_spec)
# save static model for inference directly
paddle.jit.save(net, save_path)
def init_args():
import argparse
parser = argparse.ArgumentParser(description="DBNet.paddle")
parser.add_argument("--model_path", default=r"model_best.pth", type=str)
parser.add_argument(
"--input_folder", default="./test/input", type=str, help="img path for predict"
)
parser.add_argument(
"--output_folder", default="./test/output", type=str, help="img path for output"
)
parser.add_argument("--gpu", default=0, type=int, help="gpu for inference")
parser.add_argument(
"--thre", default=0.3, type=float, help="the thresh of post_processing"
)
parser.add_argument("--polygon", action="store_true", help="output polygon or box")
parser.add_argument("--show", action="store_true", help="show result")
parser.add_argument(
"--save_result", action="store_true", help="save box and score to txt file"
)
args = parser.parse_args()
return args
if __name__ == "__main__":
import pathlib
from tqdm import tqdm
import matplotlib.pyplot as plt
from utils.util import show_img, draw_bbox, save_result, get_image_file_list
args = init_args()
print(args)
# initialize the network
model = PaddleModel(args.model_path, post_p_thre=args.thre, gpu_id=args.gpu)
img_folder = pathlib.Path(args.input_folder)
for img_path in tqdm(get_image_file_list(args.input_folder)):
preds, boxes_list, score_list, t = model.predict(
img_path, is_output_polygon=args.polygon
)
img = draw_bbox(cv2.imread(img_path)[:, :, ::-1], boxes_list)
if args.show:
show_img(preds)
show_img(img, title=os.path.basename(img_path))
plt.show()
# save results to the output folder
os.makedirs(args.output_folder, exist_ok=True)
img_path = pathlib.Path(img_path)
output_path = os.path.join(args.output_folder, img_path.stem + "_result.jpg")
pred_path = os.path.join(args.output_folder, img_path.stem + "_pred.jpg")
cv2.imwrite(output_path, img[:, :, ::-1])
cv2.imwrite(pred_path, preds * 255)
save_result(
output_path.replace("_result.jpg", ".txt"),
boxes_list,
score_list,
args.polygon,
)

View File

@ -0,0 +1,64 @@
import os
import sys
import pathlib
__dir__ = pathlib.Path(os.path.abspath(__file__))
sys.path.append(str(__dir__))
sys.path.append(str(__dir__.parent.parent))
import paddle
import paddle.distributed as dist
from utils import Config, ArgsParser
def init_args():
parser = ArgsParser()
args = parser.parse_args()
return args
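# Example usage (the same command used by the benchmark scripts in this repo):
#   CUDA_VISIBLE_DEVICES=0 python3 tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml"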
def main(config, profiler_options):
from models import build_model, build_loss
from data_loader import get_dataloader
from trainer import Trainer
from post_processing import get_post_processing
from utils import get_metric
if paddle.device.cuda.device_count() > 1:
dist.init_parallel_env()
config["distributed"] = True
else:
config["distributed"] = False
train_loader = get_dataloader(config["dataset"]["train"], config["distributed"])
assert train_loader is not None
if "validate" in config["dataset"]:
validate_loader = get_dataloader(config["dataset"]["validate"], False)
else:
validate_loader = None
criterion = build_loss(config["loss"])
config["arch"]["backbone"]["in_channels"] = (
3 if config["dataset"]["train"]["dataset"]["args"]["img_mode"] != "GRAY" else 1
)
model = build_model(config["arch"])
# set @to_static for benchmark, skip this by default.
post_p = get_post_processing(config["post_processing"])
metric = get_metric(config["metric"])
trainer = Trainer(
config=config,
model=model,
criterion=criterion,
train_loader=train_loader,
post_process=post_p,
metric_cls=metric,
validate_loader=validate_loader,
profiler_options=profiler_options,
)
trainer.train()
if __name__ == "__main__":
args = init_args()
assert os.path.exists(args.config_file)
config = Config(args.config_file)
config.merge_dict(args.opt)
main(config.cfg, args.profiler_options)

View File

@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:58
# @Author : zhoujun
from .trainer import Trainer

View File

@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:58
# @Author : zhoujun
import time
import paddle
from tqdm import tqdm
from base import BaseTrainer
from utils import runningScore, cal_text_score, Polynomial, profiler
class Trainer(BaseTrainer):
def __init__(
self,
config,
model,
criterion,
train_loader,
validate_loader,
metric_cls,
post_process=None,
profiler_options=None,
):
super(Trainer, self).__init__(
config,
model,
criterion,
train_loader,
validate_loader,
metric_cls,
post_process,
)
self.profiler_options = profiler_options
self.enable_eval = config["trainer"].get("enable_eval", True)
def _train_epoch(self, epoch):
self.model.train()
total_samples = 0
train_reader_cost = 0.0
train_batch_cost = 0.0
reader_start = time.time()
epoch_start = time.time()
train_loss = 0.0
running_metric_text = runningScore(2)
for i, batch in enumerate(self.train_loader):
profiler.add_profiler_step(self.profiler_options)
if i >= self.train_loader_len:
break
self.global_step += 1
lr = self.optimizer.get_lr()
cur_batch_size = batch["img"].shape[0]
train_reader_cost += time.time() - reader_start
if self.amp:
with paddle.amp.auto_cast(
enable="gpu" in paddle.device.get_device(),
custom_white_list=self.amp.get("custom_white_list", []),
custom_black_list=self.amp.get("custom_black_list", []),
level=self.amp.get("level", "O2"),
):
preds = self.model(batch["img"])
loss_dict = self.criterion(preds.astype(paddle.float32), batch)
scaled_loss = self.amp["scaler"].scale(loss_dict["loss"])
scaled_loss.backward()
self.amp["scaler"].minimize(self.optimizer, scaled_loss)
else:
preds = self.model(batch["img"])
loss_dict = self.criterion(preds, batch)
# backward
loss_dict["loss"].backward()
self.optimizer.step()
self.lr_scheduler.step()
self.optimizer.clear_grad()
train_batch_time = time.time() - reader_start
train_batch_cost += train_batch_time
total_samples += cur_batch_size
# acc iou
score_shrink_map = cal_text_score(
preds[:, 0, :, :],
batch["shrink_map"],
batch["shrink_mask"],
running_metric_text,
thred=self.config["post_processing"]["args"]["thresh"],
)
# log loss and acc
loss_str = "loss: {:.4f}, ".format(loss_dict["loss"].item())
for idx, (key, value) in enumerate(loss_dict.items()):
loss_dict[key] = value.item()
if key == "loss":
continue
loss_str += "{}: {:.4f}".format(key, loss_dict[key])
if idx < len(loss_dict) - 1:
loss_str += ", "
train_loss += loss_dict["loss"]
acc = score_shrink_map["Mean Acc"]
iou_shrink_map = score_shrink_map["Mean IoU"]
if self.global_step % self.log_iter == 0:
self.logger_info(
"[{}/{}], [{}/{}], global_step: {}, ips: {:.1f} samples/sec, avg_reader_cost: {:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, acc: {:.4f}, iou_shrink_map: {:.4f}, {}lr:{:.6}, time:{:.2f}".format(
epoch,
self.epochs,
i + 1,
self.train_loader_len,
self.global_step,
total_samples / train_batch_cost,
train_reader_cost / self.log_iter,
train_batch_cost / self.log_iter,
total_samples / self.log_iter,
acc,
iou_shrink_map,
loss_str,
lr,
train_batch_cost,
)
)
total_samples = 0
train_reader_cost = 0.0
train_batch_cost = 0.0
if self.visualdl_enable and paddle.distributed.get_rank() == 0:
# write metrics to VisualDL
for key, value in loss_dict.items():
self.writer.add_scalar(
"TRAIN/LOSS/{}".format(key), value, self.global_step
)
self.writer.add_scalar("TRAIN/ACC_IOU/acc", acc, self.global_step)
self.writer.add_scalar(
"TRAIN/ACC_IOU/iou_shrink_map", iou_shrink_map, self.global_step
)
self.writer.add_scalar("TRAIN/lr", lr, self.global_step)
reader_start = time.time()
return {
"train_loss": train_loss / self.train_loader_len,
"lr": lr,
"time": time.time() - epoch_start,
"epoch": epoch,
}
def _eval(self, epoch):
self.model.eval()
raw_metrics = []
total_frame = 0.0
total_time = 0.0
for i, batch in tqdm(
enumerate(self.validate_loader),
total=len(self.validate_loader),
desc="test model",
):
with paddle.no_grad():
start = time.time()
if self.amp:
with paddle.amp.auto_cast(
enable="gpu" in paddle.device.get_device(),
custom_white_list=self.amp.get("custom_white_list", []),
custom_black_list=self.amp.get("custom_black_list", []),
level=self.amp.get("level", "O2"),
):
preds = self.model(batch["img"])
preds = preds.astype(paddle.float32)
else:
preds = self.model(batch["img"])
boxes, scores = self.post_process(
batch, preds, is_output_polygon=self.metric_cls.is_output_polygon
)
total_frame += batch["img"].shape[0]
total_time += time.time() - start
raw_metric = self.metric_cls.validate_measure(batch, (boxes, scores))
raw_metrics.append(raw_metric)
metrics = self.metric_cls.gather_measure(raw_metrics)
self.logger_info("FPS:{}".format(total_frame / total_time))
return metrics["recall"].avg, metrics["precision"].avg, metrics["fmeasure"].avg
def _on_epoch_finish(self):
self.logger_info(
"[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}".format(
self.epoch_result["epoch"],
self.epochs,
self.epoch_result["train_loss"],
self.epoch_result["time"],
self.epoch_result["lr"],
)
)
net_save_path = "{}/model_latest.pth".format(self.checkpoint_dir)
net_save_path_best = "{}/model_best.pth".format(self.checkpoint_dir)
if paddle.distributed.get_rank() == 0:
self._save_checkpoint(self.epoch_result["epoch"], net_save_path)
save_best = False
if (
self.validate_loader is not None
and self.metric_cls is not None
and self.enable_eval
): # use F1 (hmean) to select the best model
recall, precision, hmean = self._eval(self.epoch_result["epoch"])
if self.visualdl_enable:
self.writer.add_scalar("EVAL/recall", recall, self.global_step)
self.writer.add_scalar(
"EVAL/precision", precision, self.global_step
)
self.writer.add_scalar("EVAL/hmean", hmean, self.global_step)
self.logger_info(
"test: recall: {:.6f}, precision: {:.6f}, hmean: {:.6f}".format(
recall, precision, hmean
)
)
if hmean >= self.metrics["hmean"]:
save_best = True
self.metrics["train_loss"] = self.epoch_result["train_loss"]
self.metrics["hmean"] = hmean
self.metrics["precision"] = precision
self.metrics["recall"] = recall
self.metrics["best_model_epoch"] = self.epoch_result["epoch"]
else:
if self.epoch_result["train_loss"] <= self.metrics["train_loss"]:
save_best = True
self.metrics["train_loss"] = self.epoch_result["train_loss"]
self.metrics["best_model_epoch"] = self.epoch_result["epoch"]
best_str = "current best, "
for k, v in self.metrics.items():
best_str += "{}: {:.6f}, ".format(k, v)
self.logger_info(best_str)
if save_best:
import shutil
shutil.copy(net_save_path, net_save_path_best)
self.logger_info("Saving current best: {}".format(net_save_path_best))
else:
self.logger_info("Saving checkpoint: {}".format(net_save_path))
def _on_train_finish(self):
if self.enable_eval:
for k, v in self.metrics.items():
self.logger_info("{}:{}".format(k, v))
self.logger_info("finish train")
def _initialize_scheduler(self):
if self.config["lr_scheduler"]["type"] == "Polynomial":
self.config["lr_scheduler"]["args"]["epochs"] = self.config["trainer"][
"epochs"
]
self.config["lr_scheduler"]["args"]["step_each_epoch"] = len(
self.train_loader
)
self.lr_scheduler = Polynomial(**self.config["lr_scheduler"]["args"])()
else:
self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)

View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:58
# @Author : zhoujun
from .util import *
from .metrics import *
from .schedulers import *
from .cal_recall.script import cal_recall_precision_f1
from .ocr_metric import get_metric

View File

@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
# @Time : 1/16/19 6:40 AM
# @Author : zhoujun
from .script import cal_recall_precision_f1
__all__ = ["cal_recall_precision_f1"]

View File

@ -0,0 +1,494 @@
#!/usr/bin/env python3
# encoding: UTF-8
import json
import sys
sys.path.append("./")
import zipfile
import re
import os
import codecs
import traceback
import numpy as np
from utils import order_points_clockwise
def print_help():
sys.stdout.write(
"Usage: python %s.py -g=<gtFile> -s=<submFile> [-o=<outputFolder> -p=<jsonParams>]"
% sys.argv[0]
)
sys.exit(2)
def load_zip_file_keys(file, fileNameRegExp=""):
"""
Returns an array with the entries of the ZIP file that match the regular expression.
The keys are the file names, or the capturing group defined in fileNameRegExp.
"""
try:
archive = zipfile.ZipFile(file, mode="r", allowZip64=True)
except:
raise Exception("Error loading the ZIP archive.")
pairs = []
for name in archive.namelist():
addFile = True
keyName = name
if fileNameRegExp != "":
m = re.match(fileNameRegExp, name)
if m == None:
addFile = False
else:
if len(m.groups()) > 0:
keyName = m.group(1)
if addFile:
pairs.append(keyName)
return pairs
def load_zip_file(file, fileNameRegExp="", allEntries=False):
"""
Returns a dict with the contents (filtered by fileNameRegExp) of a ZIP file.
The keys are the file names, or the capturing group defined in fileNameRegExp.
allEntries validates that all entries in the ZIP file pass the fileNameRegExp
"""
try:
archive = zipfile.ZipFile(file, mode="r", allowZip64=True)
except:
raise Exception("Error loading the ZIP archive")
pairs = []
for name in archive.namelist():
addFile = True
keyName = name
if fileNameRegExp != "":
m = re.match(fileNameRegExp, name)
if m == None:
addFile = False
else:
if len(m.groups()) > 0:
keyName = m.group(1)
if addFile:
pairs.append([keyName, archive.read(name)])
else:
if allEntries:
raise Exception("ZIP entry not valid: %s" % name)
return dict(pairs)
def load_folder_file(file, fileNameRegExp="", allEntries=False):
"""
Returns a dict with the contents (filtered by fileNameRegExp) of all files in a folder.
The keys are the file names, or the capturing group defined in fileNameRegExp.
allEntries validates that all entries in the folder pass the fileNameRegExp
"""
pairs = []
for name in os.listdir(file):
addFile = True
keyName = name
if fileNameRegExp != "":
m = re.match(fileNameRegExp, name)
if m == None:
addFile = False
else:
if len(m.groups()) > 0:
keyName = m.group(1)
if addFile:
pairs.append([keyName, open(os.path.join(file, name)).read()])
else:
if allEntries:
raise Exception("ZIP entry not valid: %s" % name)
return dict(pairs)
def decode_utf8(raw):
"""
Returns a Unicode object on success, or None on failure
"""
try:
raw = codecs.decode(raw, "utf-8", "replace")
# extracts BOM if exists
raw = raw.encode("utf8")
if raw.startswith(codecs.BOM_UTF8):
raw = raw.replace(codecs.BOM_UTF8, "", 1)
return raw.decode("utf-8")
except:
return None
def validate_lines_in_file(
fileName,
file_contents,
CRLF=True,
LTRB=True,
withTranscription=False,
withConfidence=False,
imWidth=0,
imHeight=0,
):
"""
This function validates that all lines of the file calling the Line validation function for each line
"""
utf8File = decode_utf8(file_contents)
if utf8File is None:
raise Exception("The file %s is not UTF-8" % fileName)
lines = utf8File.split("\r\n" if CRLF else "\n")
for line in lines:
line = line.replace("\r", "").replace("\n", "")
if line != "":
try:
validate_tl_line(
line, LTRB, withTranscription, withConfidence, imWidth, imHeight
)
except Exception as e:
raise Exception(
(
"Line in sample not valid. Sample: %s Line: %s Error: %s"
% (fileName, line, str(e))
).encode("utf-8", "replace")
)
def validate_tl_line(
line, LTRB=True, withTranscription=True, withConfidence=True, imWidth=0, imHeight=0
):
"""
Validate the format of the line. If the line is not valid an exception will be raised.
If maxWidth and maxHeight are specified, all points must be inside the image bounds.
Possible values are:
LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription]
LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription]
"""
get_tl_line_values(line, LTRB, withTranscription, withConfidence, imWidth, imHeight)
def get_tl_line_values(
line,
LTRB=True,
withTranscription=False,
withConfidence=False,
imWidth=0,
imHeight=0,
):
"""
Validate the format of the line. If the line is not valid an exception will be raised.
If maxWidth and maxHeight are specified, all points must be inside the image bounds.
Possible values are:
LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription]
LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription]
Returns values from a textline. Points, [Confidence], [Transcription]
"""
confidence = 0.0
transcription = ""
points = []
numPoints = 4
if LTRB:
numPoints = 4
if withTranscription and withConfidence:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$",
line,
)
if m == None:
raise Exception(
"Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence,transcription"
)
elif withConfidence:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$",
line,
)
if m == None:
raise Exception(
"Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence"
)
elif withTranscription:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,(.*)$",
line,
)
if m == None:
raise Exception(
"Format incorrect. Should be: xmin,ymin,xmax,ymax,transcription"
)
else:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,?\s*$",
line,
)
if m == None:
raise Exception("Format incorrect. Should be: xmin,ymin,xmax,ymax")
xmin = int(m.group(1))
ymin = int(m.group(2))
xmax = int(m.group(3))
ymax = int(m.group(4))
if xmax < xmin:
raise Exception("Xmax value (%s) not valid (Xmax < Xmin)." % (xmax))
if ymax < ymin:
raise Exception("Ymax value (%s) not valid (Ymax < Ymin)." % (ymax))
points = [float(m.group(i)) for i in range(1, (numPoints + 1))]
if imWidth > 0 and imHeight > 0:
validate_point_inside_bounds(xmin, ymin, imWidth, imHeight)
validate_point_inside_bounds(xmax, ymax, imWidth, imHeight)
else:
numPoints = 8
if withTranscription and withConfidence:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$",
line,
)
if m == None:
raise Exception(
"Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence,transcription"
)
elif withConfidence:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$",
line,
)
            if m is None:
raise Exception(
"Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence"
)
elif withTranscription:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,(.*)$",
line,
)
            if m is None:
raise Exception(
"Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,transcription"
)
else:
m = re.match(
r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*$",
line,
)
            if m is None:
raise Exception("Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4")
points = [float(m.group(i)) for i in range(1, (numPoints + 1))]
points = order_points_clockwise(np.array(points).reshape(-1, 2)).reshape(-1)
validate_clockwise_points(points)
if imWidth > 0 and imHeight > 0:
validate_point_inside_bounds(points[0], points[1], imWidth, imHeight)
validate_point_inside_bounds(points[2], points[3], imWidth, imHeight)
validate_point_inside_bounds(points[4], points[5], imWidth, imHeight)
validate_point_inside_bounds(points[6], points[7], imWidth, imHeight)
if withConfidence:
try:
confidence = float(m.group(numPoints + 1))
except ValueError:
raise Exception("Confidence value must be a float")
if withTranscription:
posTranscription = numPoints + (2 if withConfidence else 1)
transcription = m.group(posTranscription)
m2 = re.match(r"^\s*\"(.*)\"\s*$", transcription)
        if m2 is not None:
            # Transcription wrapped in double quotes: extract the value and
            # replace the escaped characters.
transcription = m2.group(1).replace("\\\\", "\\").replace('\\"', '"')
return points, confidence, transcription
def validate_point_inside_bounds(x, y, imWidth, imHeight):
    if x < 0 or x > imWidth:
        raise Exception(
            "X value (%s) not valid. Image dimensions: (%s,%s)"
            % (x, imWidth, imHeight)
        )
    if y < 0 or y > imHeight:
        raise Exception(
            "Y value (%s) not valid. Image dimensions: (%s,%s)"
            % (y, imWidth, imHeight)
        )
def validate_clockwise_points(points):
"""
Validates that the points that the 4 points that dlimite a polygon are in clockwise order.
"""
if len(points) != 8:
raise Exception("Points list not valid." + str(len(points)))
point = [
[int(points[0]), int(points[1])],
[int(points[2]), int(points[3])],
[int(points[4]), int(points[5])],
[int(points[6]), int(points[7])],
]
edge = [
(point[1][0] - point[0][0]) * (point[1][1] + point[0][1]),
(point[2][0] - point[1][0]) * (point[2][1] + point[1][1]),
(point[3][0] - point[2][0]) * (point[3][1] + point[2][1]),
(point[0][0] - point[3][0]) * (point[0][1] + point[3][1]),
]
    edge_sum = edge[0] + edge[1] + edge[2] + edge[3]
    if edge_sum > 0:
raise Exception(
"Points are not clockwise. The coordinates of bounding quadrilaterals have to be given in clockwise order. Regarding the correct interpretation of 'clockwise' remember that the image coordinate system used is the standard one, with the image origin at the upper left, the X axis extending to the right and Y axis extending downwards."
)
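# Worked example (illustrative): for the clockwise square in image coordinates
# (0,0),(1,0),(1,1),(0,1) the edge terms are (1-0)*(0+0)=0, (1-1)*(1+0)=0,
# (0-1)*(1+1)=-2 and (0-0)*(0+1)=0, so the sum is -2 <= 0 and the points are
# accepted; reversing the order flips the sum to +2 and raises the exception.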
def get_tl_line_values_from_file_contents(
content,
CRLF=True,
LTRB=True,
withTranscription=False,
withConfidence=False,
imWidth=0,
imHeight=0,
sort_by_confidences=True,
):
"""
    Returns all points, confidences and transcriptions of a file in lists. Valid line formats:
xmin,ymin,xmax,ymax,[confidence],[transcription]
x1,y1,x2,y2,x3,y3,x4,y4,[confidence],[transcription]
"""
pointsList = []
transcriptionsList = []
confidencesList = []
lines = content.split("\r\n" if CRLF else "\n")
for line in lines:
line = line.replace("\r", "").replace("\n", "")
if line != "":
points, confidence, transcription = get_tl_line_values(
line, LTRB, withTranscription, withConfidence, imWidth, imHeight
)
pointsList.append(points)
transcriptionsList.append(transcription)
confidencesList.append(confidence)
    if withConfidence and len(confidencesList) > 0 and sort_by_confidences:
sorted_ind = np.argsort(-np.array(confidencesList))
confidencesList = [confidencesList[i] for i in sorted_ind]
pointsList = [pointsList[i] for i in sorted_ind]
transcriptionsList = [transcriptionsList[i] for i in sorted_ind]
return pointsList, confidencesList, transcriptionsList
def main_evaluation(
p,
default_evaluation_params_fn,
validate_data_fn,
evaluate_method_fn,
show_result=True,
per_sample=True,
):
"""
    This process validates a method, evaluates it and, if it succeeds, generates a ZIP file with a JSON entry for each sample.
    Params:
    p: Dictionary of parameters with the GT/submission locations. If None is passed, the parameters sent by the system are used.
    default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation
    validate_data_fn: points to a method that validates the correct format of the submission
    evaluate_method_fn: points to a function that evaluates the submission and returns a dictionary with the results
"""
evalParams = default_evaluation_params_fn()
if "p" in p.keys():
evalParams.update(
p["p"] if isinstance(p["p"], dict) else json.loads(p["p"][1:-1])
)
resDict = {"calculated": True, "Message": "", "method": "{}", "per_sample": "{}"}
try:
# validate_data_fn(p['g'], p['s'], evalParams)
evalData = evaluate_method_fn(p["g"], p["s"], evalParams)
resDict.update(evalData)
except Exception as e:
traceback.print_exc()
resDict["Message"] = str(e)
resDict["calculated"] = False
if "o" in p:
if not os.path.exists(p["o"]):
os.makedirs(p["o"])
resultsOutputname = p["o"] + "/results.zip"
outZip = zipfile.ZipFile(resultsOutputname, mode="w", allowZip64=True)
del resDict["per_sample"]
if "output_items" in resDict.keys():
del resDict["output_items"]
outZip.writestr("method.json", json.dumps(resDict))
if not resDict["calculated"]:
if show_result:
sys.stderr.write("Error!\n" + resDict["Message"] + "\n\n")
if "o" in p:
outZip.close()
return resDict
if "o" in p:
        if per_sample:
            for k, v in evalData["per_sample"].items():
                outZip.writestr(k + ".json", json.dumps(v))
            if "output_items" in evalData.keys():
                for k, v in evalData["output_items"].items():
                    outZip.writestr(k, v)
outZip.close()
if show_result:
sys.stdout.write("Calculated!")
sys.stdout.write(json.dumps(resDict["method"]))
return resDict
def main_validation(default_evaluation_params_fn, validate_data_fn):
"""
This process validates a method
Params:
default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation
validate_data_fn: points to a method that validates the correct format of the submission
"""
try:
p = dict([s[1:].split("=") for s in sys.argv[1:]])
evalParams = default_evaluation_params_fn()
if "p" in p.keys():
evalParams.update(
p["p"] if isinstance(p["p"], dict) else json.loads(p["p"][1:-1])
)
validate_data_fn(p["g"], p["s"], evalParams)
print("SUCCESS")
sys.exit(0)
except Exception as e:
print(str(e))
sys.exit(101)
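# A minimal usage sketch of main_evaluation (illustrative; the ZIP paths and
# the three callback functions are hypothetical and normally come from a
# task-specific evaluation script):
#
#     p = {"g": "gt.zip", "s": "submit.zip", "o": "./out"}
#     res = main_evaluation(
#         p, default_evaluation_params, validate_data, evaluate_method
#     )
#     # res["method"] holds the global metrics; ./out/results.zip receives
#     # method.json plus one JSON file per sample when per_sample is True.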

View File

@ -0,0 +1,402 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import namedtuple
from . import rrc_evaluation_funcs
import Polygon as plg
import numpy as np
def default_evaluation_params():
"""
default_evaluation_params: Default parameters to use for the validation and evaluation.
"""
return {
"IOU_CONSTRAINT": 0.5,
"AREA_PRECISION_CONSTRAINT": 0.5,
"GT_SAMPLE_NAME_2_ID": "gt_img_([0-9]+).txt",
"DET_SAMPLE_NAME_2_ID": "res_img_([0-9]+).txt",
"LTRB": False, # LTRB:2points(left,top,right,bottom) or 4 points(x1,y1,x2,y2,x3,y3,x4,y4)
"CRLF": False, # Lines are delimited by Windows CRLF format
"CONFIDENCES": False, # Detections must include confidence value. AP will be calculated
"PER_SAMPLE_RESULTS": True, # Generate per sample results and produce data for visualization
}
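# These defaults can be overridden at evaluation time through the "p" entry of
# the parameter dict passed to main_evaluation, e.g. (illustrative):
#   p["p"] = {"IOU_CONSTRAINT": 0.7, "CONFIDENCES": True}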
def validate_data(gtFilePath, submFilePath, evaluationParams):
"""
    Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
    Also validates that there are no missing files in the folder.
If some error detected, the method raises the error
"""
gt = rrc_evaluation_funcs.load_folder_file(
gtFilePath, evaluationParams["GT_SAMPLE_NAME_2_ID"]
)
subm = rrc_evaluation_funcs.load_folder_file(
submFilePath, evaluationParams["DET_SAMPLE_NAME_2_ID"], True
)
# Validate format of GroundTruth
for k in gt:
rrc_evaluation_funcs.validate_lines_in_file(
k, gt[k], evaluationParams["CRLF"], evaluationParams["LTRB"], True
)
# Validate format of results
for k in subm:
        if k not in gt:
            raise Exception("Sample %s is not present in GT" % k)
rrc_evaluation_funcs.validate_lines_in_file(
k,
subm[k],
evaluationParams["CRLF"],
evaluationParams["LTRB"],
False,
evaluationParams["CONFIDENCES"],
)
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
"""
Method evaluate_method: evaluate method and returns the results
Results. Dictionary with the following values:
- method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
    - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 }, 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
"""
def polygon_from_points(points):
"""
Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4
"""
resBoxes = np.empty([1, 8], dtype="int32")
resBoxes[0, 0] = int(points[0])
resBoxes[0, 4] = int(points[1])
resBoxes[0, 1] = int(points[2])
resBoxes[0, 5] = int(points[3])
resBoxes[0, 2] = int(points[4])
resBoxes[0, 6] = int(points[5])
resBoxes[0, 3] = int(points[6])
resBoxes[0, 7] = int(points[7])
pointMat = resBoxes[0].reshape([2, 4]).T
return plg.Polygon(pointMat)
def rectangle_to_polygon(rect):
resBoxes = np.empty([1, 8], dtype="int32")
resBoxes[0, 0] = int(rect.xmin)
resBoxes[0, 4] = int(rect.ymax)
resBoxes[0, 1] = int(rect.xmin)
resBoxes[0, 5] = int(rect.ymin)
resBoxes[0, 2] = int(rect.xmax)
resBoxes[0, 6] = int(rect.ymin)
resBoxes[0, 3] = int(rect.xmax)
resBoxes[0, 7] = int(rect.ymax)
pointMat = resBoxes[0].reshape([2, 4]).T
return plg.Polygon(pointMat)
def rectangle_to_points(rect):
points = [
int(rect.xmin),
int(rect.ymax),
int(rect.xmax),
int(rect.ymax),
int(rect.xmax),
int(rect.ymin),
int(rect.xmin),
int(rect.ymin),
]
return points
def get_union(pD, pG):
areaA = pD.area()
areaB = pG.area()
return areaA + areaB - get_intersection(pD, pG)
def get_intersection_over_union(pD, pG):
try:
return get_intersection(pD, pG) / get_union(pD, pG)
        except ZeroDivisionError:
            return 0
def get_intersection(pD, pG):
pInt = pD & pG
if len(pInt) == 0:
return 0
return pInt.area()
def compute_ap(confList, matchList, numGtCare):
correct = 0
AP = 0
if len(confList) > 0:
confList = np.array(confList)
matchList = np.array(matchList)
sorted_ind = np.argsort(-confList)
confList = confList[sorted_ind]
matchList = matchList[sorted_ind]
for n in range(len(confList)):
match = matchList[n]
if match:
correct += 1
AP += float(correct) / (n + 1)
if numGtCare > 0:
AP /= numGtCare
return AP
perSampleMetrics = {}
matchedSum = 0
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
gt = rrc_evaluation_funcs.load_folder_file(
gtFilePath, evaluationParams["GT_SAMPLE_NAME_2_ID"]
)
subm = rrc_evaluation_funcs.load_folder_file(
submFilePath, evaluationParams["DET_SAMPLE_NAME_2_ID"], True
)
numGlobalCareGt = 0
numGlobalCareDet = 0
arrGlobalConfidences = []
arrGlobalMatches = []
for resFile in gt:
gtFile = gt[resFile] # rrc_evaluation_funcs.decode_utf8(gt[resFile])
recall = 0
precision = 0
hmean = 0
detMatched = 0
iouMat = np.empty([1, 1])
gtPols = []
detPols = []
gtPolPoints = []
detPolPoints = []
# Array of Ground Truth Polygons' keys marked as don't Care
gtDontCarePolsNum = []
# Array of Detected Polygons' matched with a don't Care GT
detDontCarePolsNum = []
pairs = []
detMatchedNums = []
arrSampleConfidences = []
arrSampleMatch = []
sampleAP = 0
evaluationLog = ""
(
pointsList,
_,
transcriptionsList,
) = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
gtFile, evaluationParams["CRLF"], evaluationParams["LTRB"], True, False
)
for n in range(len(pointsList)):
points = pointsList[n]
transcription = transcriptionsList[n]
dontCare = transcription == "###"
if evaluationParams["LTRB"]:
gtRect = Rectangle(*points)
gtPol = rectangle_to_polygon(gtRect)
else:
gtPol = polygon_from_points(points)
gtPols.append(gtPol)
gtPolPoints.append(points)
if dontCare:
gtDontCarePolsNum.append(len(gtPols) - 1)
evaluationLog += (
"GT polygons: "
+ str(len(gtPols))
+ (
" (" + str(len(gtDontCarePolsNum)) + " don't care)\n"
if len(gtDontCarePolsNum) > 0
else "\n"
)
)
if resFile in subm:
detFile = subm[resFile] # rrc_evaluation_funcs.decode_utf8(subm[resFile])
(
pointsList,
confidencesList,
_,
) = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
detFile,
evaluationParams["CRLF"],
evaluationParams["LTRB"],
False,
evaluationParams["CONFIDENCES"],
)
for n in range(len(pointsList)):
points = pointsList[n]
if evaluationParams["LTRB"]:
detRect = Rectangle(*points)
detPol = rectangle_to_polygon(detRect)
else:
detPol = polygon_from_points(points)
detPols.append(detPol)
detPolPoints.append(points)
if len(gtDontCarePolsNum) > 0:
for dontCarePol in gtDontCarePolsNum:
dontCarePol = gtPols[dontCarePol]
intersected_area = get_intersection(dontCarePol, detPol)
pdDimensions = detPol.area()
precision = (
0 if pdDimensions == 0 else intersected_area / pdDimensions
)
if precision > evaluationParams["AREA_PRECISION_CONSTRAINT"]:
detDontCarePolsNum.append(len(detPols) - 1)
break
evaluationLog += (
"DET polygons: "
+ str(len(detPols))
+ (
" (" + str(len(detDontCarePolsNum)) + " don't care)\n"
if len(detDontCarePolsNum) > 0
else "\n"
)
)
if len(gtPols) > 0 and len(detPols) > 0:
                # Calculate IoU and precision matrices
outputShape = [len(gtPols), len(detPols)]
iouMat = np.empty(outputShape)
gtRectMat = np.zeros(len(gtPols), np.int8)
detRectMat = np.zeros(len(detPols), np.int8)
for gtNum in range(len(gtPols)):
for detNum in range(len(detPols)):
pG = gtPols[gtNum]
pD = detPols[detNum]
iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
for gtNum in range(len(gtPols)):
for detNum in range(len(detPols)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCarePolsNum
and detNum not in detDontCarePolsNum
):
if (
iouMat[gtNum, detNum]
> evaluationParams["IOU_CONSTRAINT"]
):
gtRectMat[gtNum] = 1
detRectMat[detNum] = 1
detMatched += 1
pairs.append({"gt": gtNum, "det": detNum})
detMatchedNums.append(detNum)
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ "\n"
)
if evaluationParams["CONFIDENCES"]:
for detNum in range(len(detPols)):
if detNum not in detDontCarePolsNum:
# we exclude the don't care detections
match = detNum in detMatchedNums
arrSampleConfidences.append(confidencesList[detNum])
arrSampleMatch.append(match)
arrGlobalConfidences.append(confidencesList[detNum])
arrGlobalMatches.append(match)
numGtCare = len(gtPols) - len(gtDontCarePolsNum)
numDetCare = len(detPols) - len(detDontCarePolsNum)
if numGtCare == 0:
recall = float(1)
precision = float(0) if numDetCare > 0 else float(1)
sampleAP = precision
else:
recall = float(detMatched) / numGtCare
precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare
if (
evaluationParams["CONFIDENCES"]
and evaluationParams["PER_SAMPLE_RESULTS"]
):
sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, numGtCare)
hmean = (
0
if (precision + recall) == 0
else 2.0 * precision * recall / (precision + recall)
)
matchedSum += detMatched
numGlobalCareGt += numGtCare
numGlobalCareDet += numDetCare
if evaluationParams["PER_SAMPLE_RESULTS"]:
perSampleMetrics[resFile] = {
"precision": precision,
"recall": recall,
"hmean": hmean,
"pairs": pairs,
"AP": sampleAP,
"iouMat": [] if len(detPols) > 100 else iouMat.tolist(),
"gtPolPoints": gtPolPoints,
"detPolPoints": detPolPoints,
"gtDontCare": gtDontCarePolsNum,
"detDontCare": detDontCarePolsNum,
"evaluationParams": evaluationParams,
"evaluationLog": evaluationLog,
}
# Compute MAP and MAR
AP = 0
if evaluationParams["CONFIDENCES"]:
AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt)
methodRecall = 0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
methodPrecision = (
0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
)
methodHmean = (
0
if methodRecall + methodPrecision == 0
else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
)
methodMetrics = {
"precision": methodPrecision,
"recall": methodRecall,
"hmean": methodHmean,
"AP": AP,
}
resDict = {
"calculated": True,
"Message": "",
"method": methodMetrics,
"per_sample": perSampleMetrics,
}
return resDict
def cal_recall_precision_f1(gt_path, result_path, show_result=False):
p = {"g": gt_path, "s": result_path}
result = rrc_evaluation_funcs.main_evaluation(
p, default_evaluation_params, validate_data, evaluate_method, show_result
)
return result["method"]

View File

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/7 14:46
# @Author : zhoujun
import numpy as np
import cv2
import os
import random
from tqdm import tqdm
# calculate per-channel means and stds over a random sample of images
train_txt_path = "./train_val_list.txt"
CNum = 10000  # number of images to sample for the statistics
img_h, img_w = 640, 640
imgs = np.zeros([img_w, img_h, 3, 0])
means, stdevs = [], []
with open(train_txt_path, "r") as f:
lines = f.readlines()
    random.shuffle(lines)  # shuffle so the sampled images are random
for i in tqdm(range(CNum)):
img_path = lines[i].split("\t")[0]
img = cv2.imread(img_path)
img = cv2.resize(img, (img_h, img_w))
img = img[:, :, :, np.newaxis]
imgs = np.concatenate((imgs, img), axis=3)
# print(i)
imgs = imgs.astype(np.float32) / 255.0
for i in tqdm(range(3)):
    pixels = imgs[:, :, i, :].ravel()  # flatten to 1-D
means.append(np.mean(pixels))
stdevs.append(np.std(pixels))
# cv2 reads images as BGR; PIL/skimage read RGB, so those need no conversion
means.reverse() # BGR --> RGB
stdevs.reverse()
print("normMean = {}".format(means))
print("normStd = {}".format(stdevs))
print("transforms.Normalize(normMean = {}, normStd = {})".format(means, stdevs))

View File

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/24 12:06
# @Author : zhoujun
import os
import glob
import pathlib
data_path = r"test"
# data_path/img holds the images
# data_path/gt holds the label files
f_w = open(os.path.join(data_path, "test.txt"), "w", encoding="utf8")
for img_path in glob.glob(data_path + "/img/*.jpg", recursive=True):
    d = pathlib.Path(img_path)
    label_path = os.path.join(data_path, "gt", ("gt_" + str(d.stem) + ".txt"))
    if os.path.exists(img_path) and os.path.exists(label_path):
        print(img_path, label_path)
        f_w.write("{}\t{}\n".format(img_path, label_path))
    else:
        print("missing:", img_path, label_path)
f_w.close()
f_w.close()

View File

@ -0,0 +1,60 @@
# Adapted from score written by wkentaro
# https://github.com/wkentaro/pytorch-fcn/blob/master/torchfcn/utils.py
import numpy as np
class runningScore(object):
def __init__(self, n_classes):
self.n_classes = n_classes
self.confusion_matrix = np.zeros((n_classes, n_classes))
def _fast_hist(self, label_true, label_pred, n_class):
mask = (label_true >= 0) & (label_true < n_class)
if np.sum((label_pred[mask] < 0)) > 0:
print(label_pred[label_pred < 0])
hist = np.bincount(
n_class * label_true[mask].astype(int) + label_pred[mask],
minlength=n_class**2,
).reshape(n_class, n_class)
return hist
def update(self, label_trues, label_preds):
# print label_trues.dtype, label_preds.dtype
for lt, lp in zip(label_trues, label_preds):
try:
self.confusion_matrix += self._fast_hist(
lt.flatten(), lp.flatten(), self.n_classes
)
            except Exception:
                # skip samples whose labels fall outside the valid class range
                pass
def get_scores(self):
"""Returns accuracy score evaluation result.
- overall accuracy
- mean accuracy
- mean IU
- fwavacc
"""
hist = self.confusion_matrix
acc = np.diag(hist).sum() / (hist.sum() + 0.0001)
acc_cls = np.diag(hist) / (hist.sum(axis=1) + 0.0001)
acc_cls = np.nanmean(acc_cls)
iu = np.diag(hist) / (
hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist) + 0.0001
)
mean_iu = np.nanmean(iu)
freq = hist.sum(axis=1) / (hist.sum() + 0.0001)
fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()
cls_iu = dict(zip(range(self.n_classes), iu))
return {
"Overall Acc": acc,
"Mean Acc": acc_cls,
"FreqW Acc": fwavacc,
"Mean IoU": mean_iu,
}, cls_iu
def reset(self):
self.confusion_matrix = np.zeros((self.n_classes, self.n_classes))
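if __name__ == "__main__":
    # A small self-check sketch (illustrative): two classes, a 2x2 prediction
    # that gets three of four pixels right.
    score = runningScore(n_classes=2)
    gt = np.array([[0, 0], [1, 1]])
    pred = np.array([[0, 1], [1, 1]])
    score.update([gt], [pred])
    metrics, cls_iu = score.get_scores()
    print(metrics)  # Overall Acc should be ~0.75
    print(cls_iu)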

View File

@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/5 15:36
# @Author : zhoujun
from .icdar2015 import QuadMetric
def get_metric(config):
try:
if "args" not in config:
args = {}
else:
args = config["args"]
if isinstance(args, dict):
cls = eval(config["type"])(**args)
else:
cls = eval(config["type"])(args)
return cls
    except Exception:
        return None
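# Example config (illustrative): with the import above, a config such as
#   {"type": "QuadMetric", "args": {"is_output_polygon": False}}
# makes get_metric return a QuadMetric instance; on any error it returns None.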

View File

@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
# @Time : 2019/12/5 15:36
# @Author : zhoujun
from .quad_metric import QuadMetric

View File

@ -0,0 +1,474 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
from collections import namedtuple
import numpy as np
from shapely.geometry import Polygon
class DetectionDetEvalEvaluator(object):
def __init__(
self,
area_recall_constraint=0.8,
area_precision_constraint=0.4,
ev_param_ind_center_diff_thr=1,
mtype_oo_o=1.0,
mtype_om_o=0.8,
mtype_om_m=1.0,
):
self.area_recall_constraint = area_recall_constraint
self.area_precision_constraint = area_precision_constraint
self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr
self.mtype_oo_o = mtype_oo_o
self.mtype_om_o = mtype_om_o
self.mtype_om_m = mtype_om_m
def evaluate_image(self, gt, pred):
def get_union(pD, pG):
return Polygon(pD).union(Polygon(pG)).area
def get_intersection_over_union(pD, pG):
return get_intersection(pD, pG) / get_union(pD, pG)
def get_intersection(pD, pG):
return Polygon(pD).intersection(Polygon(pG)).area
def one_to_one_match(row, col):
cont = 0
for j in range(len(recallMat[0])):
if (
recallMat[row, j] >= self.area_recall_constraint
and precisionMat[row, j] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
cont = 0
for i in range(len(recallMat)):
if (
recallMat[i, col] >= self.area_recall_constraint
and precisionMat[i, col] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
if (
recallMat[row, col] >= self.area_recall_constraint
and precisionMat[row, col] >= self.area_precision_constraint
):
return True
return False
def num_overlaps_gt(gtNum):
cont = 0
for detNum in range(len(detRects)):
if detNum not in detDontCareRectsNum:
if recallMat[gtNum, detNum] > 0:
cont = cont + 1
return cont
def num_overlaps_det(detNum):
cont = 0
for gtNum in range(len(recallMat)):
if gtNum not in gtDontCareRectsNum:
if recallMat[gtNum, detNum] > 0:
cont = cont + 1
return cont
def is_single_overlap(row, col):
if num_overlaps_gt(row) == 1 and num_overlaps_det(col) == 1:
return True
else:
return False
def one_to_many_match(gtNum):
many_sum = 0
detRects = []
for detNum in range(len(recallMat[0])):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and detNum not in detDontCareRectsNum
):
if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
many_sum += recallMat[gtNum, detNum]
detRects.append(detNum)
if round(many_sum, 4) >= self.area_recall_constraint:
return True, detRects
else:
return False, []
def many_to_one_match(detNum):
many_sum = 0
gtRects = []
for gtNum in range(len(recallMat)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
):
if recallMat[gtNum, detNum] >= self.area_recall_constraint:
many_sum += precisionMat[gtNum, detNum]
gtRects.append(gtNum)
if round(many_sum, 4) >= self.area_precision_constraint:
return True, gtRects
else:
return False, []
def center_distance(r1, r2):
return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5
def diag(r):
r = np.array(r)
return (
(r[:, 0].max() - r[:, 0].min()) ** 2
+ (r[:, 1].max() - r[:, 1].min()) ** 2
) ** 0.5
perSampleMetrics = {}
recall = 0
precision = 0
hmean = 0
recallAccum = 0.0
precisionAccum = 0.0
gtRects = []
detRects = []
gtPolPoints = []
detPolPoints = []
gtDontCareRectsNum = (
[]
) # Array of Ground Truth Rectangles' keys marked as don't Care
detDontCareRectsNum = (
[]
) # Array of Detected Rectangles' matched with a don't Care GT
pairs = []
evaluationLog = ""
recallMat = np.empty([1, 1])
precisionMat = np.empty([1, 1])
for n in range(len(gt)):
points = gt[n]["points"]
# transcription = gt[n]['text']
dontCare = gt[n]["ignore"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
gtRects.append(points)
gtPolPoints.append(points)
if dontCare:
gtDontCareRectsNum.append(len(gtRects) - 1)
evaluationLog += (
"GT rectangles: "
+ str(len(gtRects))
+ (
" (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
if len(gtDontCareRectsNum) > 0
else "\n"
)
)
for n in range(len(pred)):
points = pred[n]["points"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
detRect = points
detRects.append(detRect)
detPolPoints.append(points)
if len(gtDontCareRectsNum) > 0:
for dontCareRectNum in gtDontCareRectsNum:
dontCareRect = gtRects[dontCareRectNum]
intersected_area = get_intersection(dontCareRect, detRect)
rdDimensions = Polygon(detRect).area
if rdDimensions == 0:
precision = 0
else:
precision = intersected_area / rdDimensions
if precision > self.area_precision_constraint:
detDontCareRectsNum.append(len(detRects) - 1)
break
evaluationLog += (
"DET rectangles: "
+ str(len(detRects))
+ (
" (" + str(len(detDontCareRectsNum)) + " don't care)\n"
if len(detDontCareRectsNum) > 0
else "\n"
)
)
if len(gtRects) == 0:
recall = 1
precision = 0 if len(detRects) > 0 else 1
if len(detRects) > 0:
            # Calculate recall and precision matrices
outputShape = [len(gtRects), len(detRects)]
recallMat = np.empty(outputShape)
precisionMat = np.empty(outputShape)
gtRectMat = np.zeros(len(gtRects), np.int8)
detRectMat = np.zeros(len(detRects), np.int8)
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
rG = gtRects[gtNum]
rD = detRects[detNum]
intersected_area = get_intersection(rG, rD)
rgDimensions = Polygon(rG).area
rdDimensions = Polygon(rD).area
recallMat[gtNum, detNum] = (
0 if rgDimensions == 0 else intersected_area / rgDimensions
)
precisionMat[gtNum, detNum] = (
0 if rdDimensions == 0 else intersected_area / rdDimensions
)
# Find one-to-one matches
evaluationLog += "Find one-to-one matches\n"
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
and detNum not in detDontCareRectsNum
):
match = one_to_one_match(gtNum, detNum)
if match is True:
                            # in DetEval we must perform additional validation before marking a pair as one-to-one
if is_single_overlap(gtNum, detNum) is True:
rG = gtRects[gtNum]
rD = detRects[detNum]
normDist = center_distance(rG, rD)
normDist /= diag(rG) + diag(rD)
normDist *= 2.0
if normDist < self.ev_param_ind_center_diff_thr:
gtRectMat[gtNum] = 1
detRectMat[detNum] = 1
recallAccum += self.mtype_oo_o
precisionAccum += self.mtype_oo_o
pairs.append(
{"gt": gtNum, "det": detNum, "type": "OO"}
)
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ "\n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ " normDist: "
+ str(normDist)
+ " \n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ " not single overlap\n"
)
# Find one-to-many matches
evaluationLog += "Find one-to-many matches\n"
for gtNum in range(len(gtRects)):
if gtNum not in gtDontCareRectsNum:
match, matchesDet = one_to_many_match(gtNum)
if match is True:
evaluationLog += "num_overlaps_gt=" + str(
num_overlaps_gt(gtNum)
)
                        # in DetEval we must perform additional validation before marking the match
if num_overlaps_gt(gtNum) >= 2:
gtRectMat[gtNum] = 1
recallAccum += (
self.mtype_oo_o
if len(matchesDet) == 1
else self.mtype_om_o
)
precisionAccum += (
self.mtype_oo_o
if len(matchesDet) == 1
else self.mtype_om_o * len(matchesDet)
)
pairs.append(
{
"gt": gtNum,
"det": matchesDet,
"type": "OO" if len(matchesDet) == 1 else "OM",
}
)
for detNum in matchesDet:
detRectMat[detNum] = 1
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(matchesDet)
+ "\n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(gtNum)
+ " with Det #"
+ str(matchesDet)
+ " not single overlap\n"
)
# Find many-to-one matches
evaluationLog += "Find many-to-one matches\n"
for detNum in range(len(detRects)):
if detNum not in detDontCareRectsNum:
match, matchesGt = many_to_one_match(detNum)
if match is True:
                        # in DetEval we must perform additional validation before marking the match
if num_overlaps_det(detNum) >= 2:
detRectMat[detNum] = 1
recallAccum += (
self.mtype_oo_o
if len(matchesGt) == 1
else self.mtype_om_m * len(matchesGt)
)
precisionAccum += (
self.mtype_oo_o
if len(matchesGt) == 1
else self.mtype_om_m
)
pairs.append(
{
"gt": matchesGt,
"det": detNum,
"type": "OO" if len(matchesGt) == 1 else "MO",
}
)
for gtNum in matchesGt:
gtRectMat[gtNum] = 1
evaluationLog += (
"Match GT #"
+ str(matchesGt)
+ " with Det #"
+ str(detNum)
+ "\n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(matchesGt)
+ " with Det #"
+ str(detNum)
+ " not single overlap\n"
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
if numGtCare == 0:
recall = float(1)
precision = float(0) if len(detRects) > 0 else float(1)
else:
recall = float(recallAccum) / numGtCare
precision = (
float(0)
if (len(detRects) - len(detDontCareRectsNum)) == 0
else float(precisionAccum)
/ (len(detRects) - len(detDontCareRectsNum))
)
hmean = (
0
if (precision + recall) == 0
else 2.0 * precision * recall / (precision + recall)
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
numDetCare = len(detRects) - len(detDontCareRectsNum)
perSampleMetrics = {
"precision": precision,
"recall": recall,
"hmean": hmean,
"pairs": pairs,
"recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
"precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
"gtPolPoints": gtPolPoints,
"detPolPoints": detPolPoints,
"gtCare": numGtCare,
"detCare": numDetCare,
"gtDontCare": gtDontCareRectsNum,
"detDontCare": detDontCareRectsNum,
"recallAccum": recallAccum,
"precisionAccum": precisionAccum,
"evaluationLog": evaluationLog,
}
return perSampleMetrics
def combine_results(self, results):
numGt = 0
numDet = 0
methodRecallSum = 0
methodPrecisionSum = 0
for result in results:
numGt += result["gtCare"]
numDet += result["detCare"]
methodRecallSum += result["recallAccum"]
methodPrecisionSum += result["precisionAccum"]
methodRecall = 0 if numGt == 0 else methodRecallSum / numGt
methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet
methodHmean = (
0
if methodRecall + methodPrecision == 0
else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
)
methodMetrics = {
"precision": methodPrecision,
"recall": methodRecall,
"hmean": methodHmean,
}
return methodMetrics
if __name__ == "__main__":
evaluator = DetectionDetEvalEvaluator()
gts = [
[
{
"points": [(0, 0), (1, 0), (1, 1), (0, 1)],
"text": 1234,
"ignore": False,
},
{
"points": [(2, 2), (3, 2), (3, 3), (2, 3)],
"text": 5678,
"ignore": True,
},
]
]
preds = [
[
{
"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
"text": 123,
"ignore": False,
}
]
]
results = []
for gt, pred in zip(gts, preds):
results.append(evaluator.evaluate_image(gt, pred))
metrics = evaluator.combine_results(results)
print(metrics)

View File

@ -0,0 +1,417 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
from collections import namedtuple
import numpy as np
from shapely.geometry import Polygon
class DetectionICDAR2013Evaluator(object):
def __init__(
self,
area_recall_constraint=0.8,
area_precision_constraint=0.4,
ev_param_ind_center_diff_thr=1,
mtype_oo_o=1.0,
mtype_om_o=0.8,
mtype_om_m=1.0,
):
self.area_recall_constraint = area_recall_constraint
self.area_precision_constraint = area_precision_constraint
self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr
self.mtype_oo_o = mtype_oo_o
self.mtype_om_o = mtype_om_o
self.mtype_om_m = mtype_om_m
def evaluate_image(self, gt, pred):
def get_union(pD, pG):
return Polygon(pD).union(Polygon(pG)).area
def get_intersection_over_union(pD, pG):
return get_intersection(pD, pG) / get_union(pD, pG)
def get_intersection(pD, pG):
return Polygon(pD).intersection(Polygon(pG)).area
def one_to_one_match(row, col):
cont = 0
for j in range(len(recallMat[0])):
if (
recallMat[row, j] >= self.area_recall_constraint
and precisionMat[row, j] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
cont = 0
for i in range(len(recallMat)):
if (
recallMat[i, col] >= self.area_recall_constraint
and precisionMat[i, col] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
if (
recallMat[row, col] >= self.area_recall_constraint
and precisionMat[row, col] >= self.area_precision_constraint
):
return True
return False
def one_to_many_match(gtNum):
many_sum = 0
detRects = []
for detNum in range(len(recallMat[0])):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and detNum not in detDontCareRectsNum
):
if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
many_sum += recallMat[gtNum, detNum]
detRects.append(detNum)
if round(many_sum, 4) >= self.area_recall_constraint:
return True, detRects
else:
return False, []
def many_to_one_match(detNum):
many_sum = 0
gtRects = []
for gtNum in range(len(recallMat)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
):
if recallMat[gtNum, detNum] >= self.area_recall_constraint:
many_sum += precisionMat[gtNum, detNum]
gtRects.append(gtNum)
if round(many_sum, 4) >= self.area_precision_constraint:
return True, gtRects
else:
return False, []
def center_distance(r1, r2):
return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5
def diag(r):
r = np.array(r)
return (
(r[:, 0].max() - r[:, 0].min()) ** 2
+ (r[:, 1].max() - r[:, 1].min()) ** 2
) ** 0.5
perSampleMetrics = {}
recall = 0
precision = 0
hmean = 0
recallAccum = 0.0
precisionAccum = 0.0
gtRects = []
detRects = []
gtPolPoints = []
detPolPoints = []
gtDontCareRectsNum = (
[]
) # Array of Ground Truth Rectangles' keys marked as don't Care
detDontCareRectsNum = (
[]
) # Array of Detected Rectangles' matched with a don't Care GT
pairs = []
evaluationLog = ""
recallMat = np.empty([1, 1])
precisionMat = np.empty([1, 1])
for n in range(len(gt)):
points = gt[n]["points"]
# transcription = gt[n]['text']
dontCare = gt[n]["ignore"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
gtRects.append(points)
gtPolPoints.append(points)
if dontCare:
gtDontCareRectsNum.append(len(gtRects) - 1)
evaluationLog += (
"GT rectangles: "
+ str(len(gtRects))
+ (
" (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
if len(gtDontCareRectsNum) > 0
else "\n"
)
)
for n in range(len(pred)):
points = pred[n]["points"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
detRect = points
detRects.append(detRect)
detPolPoints.append(points)
if len(gtDontCareRectsNum) > 0:
for dontCareRectNum in gtDontCareRectsNum:
dontCareRect = gtRects[dontCareRectNum]
intersected_area = get_intersection(dontCareRect, detRect)
rdDimensions = Polygon(detRect).area
if rdDimensions == 0:
precision = 0
else:
precision = intersected_area / rdDimensions
if precision > self.area_precision_constraint:
detDontCareRectsNum.append(len(detRects) - 1)
break
evaluationLog += (
"DET rectangles: "
+ str(len(detRects))
+ (
" (" + str(len(detDontCareRectsNum)) + " don't care)\n"
if len(detDontCareRectsNum) > 0
else "\n"
)
)
if len(gtRects) == 0:
recall = 1
precision = 0 if len(detRects) > 0 else 1
if len(detRects) > 0:
            # Calculate recall and precision matrices
outputShape = [len(gtRects), len(detRects)]
recallMat = np.empty(outputShape)
precisionMat = np.empty(outputShape)
gtRectMat = np.zeros(len(gtRects), np.int8)
detRectMat = np.zeros(len(detRects), np.int8)
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
rG = gtRects[gtNum]
rD = detRects[detNum]
intersected_area = get_intersection(rG, rD)
rgDimensions = Polygon(rG).area
rdDimensions = Polygon(rD).area
recallMat[gtNum, detNum] = (
0 if rgDimensions == 0 else intersected_area / rgDimensions
)
precisionMat[gtNum, detNum] = (
0 if rdDimensions == 0 else intersected_area / rdDimensions
)
# Find one-to-one matches
evaluationLog += "Find one-to-one matches\n"
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
and detNum not in detDontCareRectsNum
):
match = one_to_one_match(gtNum, detNum)
if match is True:
                            # following DetEval, perform additional validation before marking a pair as one-to-one
rG = gtRects[gtNum]
rD = detRects[detNum]
normDist = center_distance(rG, rD)
normDist /= diag(rG) + diag(rD)
normDist *= 2.0
if normDist < self.ev_param_ind_center_diff_thr:
gtRectMat[gtNum] = 1
detRectMat[detNum] = 1
recallAccum += self.mtype_oo_o
precisionAccum += self.mtype_oo_o
pairs.append({"gt": gtNum, "det": detNum, "type": "OO"})
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ "\n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ " normDist: "
+ str(normDist)
+ " \n"
)
# Find one-to-many matches
evaluationLog += "Find one-to-many matches\n"
for gtNum in range(len(gtRects)):
if gtNum not in gtDontCareRectsNum:
match, matchesDet = one_to_many_match(gtNum)
if match is True:
evaluationLog += "num_overlaps_gt=" + str(
num_overlaps_gt(gtNum)
)
gtRectMat[gtNum] = 1
recallAccum += (
self.mtype_oo_o if len(matchesDet) == 1 else self.mtype_om_o
)
precisionAccum += (
self.mtype_oo_o
if len(matchesDet) == 1
else self.mtype_om_o * len(matchesDet)
)
pairs.append(
{
"gt": gtNum,
"det": matchesDet,
"type": "OO" if len(matchesDet) == 1 else "OM",
}
)
for detNum in matchesDet:
detRectMat[detNum] = 1
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(matchesDet)
+ "\n"
)
# Find many-to-one matches
evaluationLog += "Find many-to-one matches\n"
for detNum in range(len(detRects)):
if detNum not in detDontCareRectsNum:
match, matchesGt = many_to_one_match(detNum)
if match is True:
detRectMat[detNum] = 1
recallAccum += (
self.mtype_oo_o
if len(matchesGt) == 1
else self.mtype_om_m * len(matchesGt)
)
precisionAccum += (
self.mtype_oo_o if len(matchesGt) == 1 else self.mtype_om_m
)
pairs.append(
{
"gt": matchesGt,
"det": detNum,
"type": "OO" if len(matchesGt) == 1 else "MO",
}
)
for gtNum in matchesGt:
gtRectMat[gtNum] = 1
evaluationLog += (
"Match GT #"
+ str(matchesGt)
+ " with Det #"
+ str(detNum)
+ "\n"
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
if numGtCare == 0:
recall = float(1)
precision = float(0) if len(detRects) > 0 else float(1)
else:
recall = float(recallAccum) / numGtCare
precision = (
float(0)
if (len(detRects) - len(detDontCareRectsNum)) == 0
else float(precisionAccum)
/ (len(detRects) - len(detDontCareRectsNum))
)
hmean = (
0
if (precision + recall) == 0
else 2.0 * precision * recall / (precision + recall)
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
numDetCare = len(detRects) - len(detDontCareRectsNum)
perSampleMetrics = {
"precision": precision,
"recall": recall,
"hmean": hmean,
"pairs": pairs,
"recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
"precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
"gtPolPoints": gtPolPoints,
"detPolPoints": detPolPoints,
"gtCare": numGtCare,
"detCare": numDetCare,
"gtDontCare": gtDontCareRectsNum,
"detDontCare": detDontCareRectsNum,
"recallAccum": recallAccum,
"precisionAccum": precisionAccum,
"evaluationLog": evaluationLog,
}
return perSampleMetrics
def combine_results(self, results):
numGt = 0
numDet = 0
methodRecallSum = 0
methodPrecisionSum = 0
for result in results:
numGt += result["gtCare"]
numDet += result["detCare"]
methodRecallSum += result["recallAccum"]
methodPrecisionSum += result["precisionAccum"]
methodRecall = 0 if numGt == 0 else methodRecallSum / numGt
methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet
methodHmean = (
0
if methodRecall + methodPrecision == 0
else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
)
methodMetrics = {
"precision": methodPrecision,
"recall": methodRecall,
"hmean": methodHmean,
}
return methodMetrics
if __name__ == "__main__":
evaluator = DetectionICDAR2013Evaluator()
gts = [
[
{
"points": [(0, 0), (1, 0), (1, 1), (0, 1)],
"text": 1234,
"ignore": False,
},
{
"points": [(2, 2), (3, 2), (3, 3), (2, 3)],
"text": 5678,
"ignore": True,
},
]
]
preds = [
[
{
"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
"text": 123,
"ignore": False,
}
]
]
results = []
for gt, pred in zip(gts, preds):
results.append(evaluator.evaluate_image(gt, pred))
metrics = evaluator.combine_results(results)
print(metrics)

View File

@ -0,0 +1,300 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import namedtuple
import numpy as np
from shapely.geometry import Polygon
import cv2
def iou_rotate(box_a, box_b, method="union"):
rect_a = cv2.minAreaRect(box_a)
rect_b = cv2.minAreaRect(box_b)
r1 = cv2.rotatedRectangleIntersection(rect_a, rect_b)
if r1[0] == 0:
return 0
else:
inter_area = cv2.contourArea(r1[1])
area_a = cv2.contourArea(box_a)
area_b = cv2.contourArea(box_b)
union_area = area_a + area_b - inter_area
if union_area == 0 or inter_area == 0:
return 0
if method == "union":
iou = inter_area / union_area
elif method == "intersection":
iou = inter_area / min(area_a, area_b)
else:
raise NotImplementedError
return iou
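# A quick sanity sketch (illustrative): two axis-aligned 2x2 squares offset by
# (1, 1) overlap in a 1x1 region, so the union IoU is 1 / (4 + 4 - 1) ~= 0.143:
#
#     box_a = np.array([[0, 0], [2, 0], [2, 2], [0, 2]], dtype=np.float32)
#     box_b = box_a + 1.0
#     print(iou_rotate(box_a, box_b))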
class DetectionIoUEvaluator(object):
def __init__(
self, is_output_polygon=False, iou_constraint=0.5, area_precision_constraint=0.5
):
self.is_output_polygon = is_output_polygon
self.iou_constraint = iou_constraint
self.area_precision_constraint = area_precision_constraint
def evaluate_image(self, gt, pred):
def get_union(pD, pG):
return Polygon(pD).union(Polygon(pG)).area
def get_intersection_over_union(pD, pG):
return get_intersection(pD, pG) / get_union(pD, pG)
def get_intersection(pD, pG):
return Polygon(pD).intersection(Polygon(pG)).area
def compute_ap(confList, matchList, numGtCare):
correct = 0
AP = 0
if len(confList) > 0:
confList = np.array(confList)
matchList = np.array(matchList)
sorted_ind = np.argsort(-confList)
confList = confList[sorted_ind]
matchList = matchList[sorted_ind]
for n in range(len(confList)):
match = matchList[n]
if match:
correct += 1
AP += float(correct) / (n + 1)
if numGtCare > 0:
AP /= numGtCare
return AP
perSampleMetrics = {}
matchedSum = 0
Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")
numGlobalCareGt = 0
numGlobalCareDet = 0
arrGlobalConfidences = []
arrGlobalMatches = []
recall = 0
precision = 0
hmean = 0
detMatched = 0
iouMat = np.empty([1, 1])
gtPols = []
detPols = []
gtPolPoints = []
detPolPoints = []
# Array of Ground Truth Polygons' keys marked as don't Care
gtDontCarePolsNum = []
# Array of Detected Polygons' matched with a don't Care GT
detDontCarePolsNum = []
pairs = []
detMatchedNums = []
arrSampleConfidences = []
arrSampleMatch = []
evaluationLog = ""
for n in range(len(gt)):
points = gt[n]["points"]
# transcription = gt[n]['text']
dontCare = gt[n]["ignore"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
gtPol = points
gtPols.append(gtPol)
gtPolPoints.append(points)
if dontCare:
gtDontCarePolsNum.append(len(gtPols) - 1)
evaluationLog += (
"GT polygons: "
+ str(len(gtPols))
+ (
" (" + str(len(gtDontCarePolsNum)) + " don't care)\n"
if len(gtDontCarePolsNum) > 0
else "\n"
)
)
for n in range(len(pred)):
points = pred[n]["points"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
detPol = points
detPols.append(detPol)
detPolPoints.append(points)
if len(gtDontCarePolsNum) > 0:
for dontCarePol in gtDontCarePolsNum:
dontCarePol = gtPols[dontCarePol]
intersected_area = get_intersection(dontCarePol, detPol)
pdDimensions = Polygon(detPol).area
precision = (
0 if pdDimensions == 0 else intersected_area / pdDimensions
)
if precision > self.area_precision_constraint:
detDontCarePolsNum.append(len(detPols) - 1)
break
evaluationLog += (
"DET polygons: "
+ str(len(detPols))
+ (
" (" + str(len(detDontCarePolsNum)) + " don't care)\n"
if len(detDontCarePolsNum) > 0
else "\n"
)
)
if len(gtPols) > 0 and len(detPols) > 0:
            # Calculate IoU and precision matrices
outputShape = [len(gtPols), len(detPols)]
iouMat = np.empty(outputShape)
gtRectMat = np.zeros(len(gtPols), np.int8)
detRectMat = np.zeros(len(detPols), np.int8)
if self.is_output_polygon:
for gtNum in range(len(gtPols)):
for detNum in range(len(detPols)):
pG = gtPols[gtNum]
pD = detPols[detNum]
iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
else:
# gtPols = np.float32(gtPols)
# detPols = np.float32(detPols)
for gtNum in range(len(gtPols)):
for detNum in range(len(detPols)):
pG = np.float32(gtPols[gtNum])
pD = np.float32(detPols[detNum])
iouMat[gtNum, detNum] = iou_rotate(pD, pG)
for gtNum in range(len(gtPols)):
for detNum in range(len(detPols)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCarePolsNum
and detNum not in detDontCarePolsNum
):
if iouMat[gtNum, detNum] > self.iou_constraint:
gtRectMat[gtNum] = 1
detRectMat[detNum] = 1
detMatched += 1
pairs.append({"gt": gtNum, "det": detNum})
detMatchedNums.append(detNum)
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ "\n"
)
numGtCare = len(gtPols) - len(gtDontCarePolsNum)
numDetCare = len(detPols) - len(detDontCarePolsNum)
if numGtCare == 0:
recall = float(1)
precision = float(0) if numDetCare > 0 else float(1)
else:
recall = float(detMatched) / numGtCare
precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare
hmean = (
0
if (precision + recall) == 0
else 2.0 * precision * recall / (precision + recall)
)
matchedSum += detMatched
numGlobalCareGt += numGtCare
numGlobalCareDet += numDetCare
perSampleMetrics = {
"precision": precision,
"recall": recall,
"hmean": hmean,
"pairs": pairs,
"iouMat": [] if len(detPols) > 100 else iouMat.tolist(),
"gtPolPoints": gtPolPoints,
"detPolPoints": detPolPoints,
"gtCare": numGtCare,
"detCare": numDetCare,
"gtDontCare": gtDontCarePolsNum,
"detDontCare": detDontCarePolsNum,
"detMatched": detMatched,
"evaluationLog": evaluationLog,
}
return perSampleMetrics
def combine_results(self, results):
numGlobalCareGt = 0
numGlobalCareDet = 0
matchedSum = 0
for result in results:
numGlobalCareGt += result["gtCare"]
numGlobalCareDet += result["detCare"]
matchedSum += result["detMatched"]
methodRecall = (
0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
)
methodPrecision = (
0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
)
methodHmean = (
0
if methodRecall + methodPrecision == 0
else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
)
methodMetrics = {
"precision": methodPrecision,
"recall": methodRecall,
"hmean": methodHmean,
}
return methodMetrics
if __name__ == "__main__":
evaluator = DetectionIoUEvaluator()
preds = [
[
{
"points": [(0.1, 0.1), (0.5, 0), (0.5, 1), (0, 1)],
"text": 1234,
"ignore": False,
},
{
"points": [(0.5, 0.1), (1, 0), (1, 1), (0.5, 1)],
"text": 5678,
"ignore": False,
},
]
]
gts = [
[
{
"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
"text": 123,
"ignore": False,
}
]
]
results = []
for gt, pred in zip(gts, preds):
results.append(evaluator.evaluate_image(gt, pred))
metrics = evaluator.combine_results(results)
print(metrics)

View File

@ -0,0 +1,398 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import math
from collections import namedtuple
import numpy as np
from shapely.geometry import Polygon
class DetectionMTWI2018Evaluator(object):
def __init__(
self,
area_recall_constraint=0.7,
area_precision_constraint=0.7,
ev_param_ind_center_diff_thr=1,
):
self.area_recall_constraint = area_recall_constraint
self.area_precision_constraint = area_precision_constraint
self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr
def evaluate_image(self, gt, pred):
def get_union(pD, pG):
return Polygon(pD).union(Polygon(pG)).area
def get_intersection_over_union(pD, pG):
return get_intersection(pD, pG) / get_union(pD, pG)
def get_intersection(pD, pG):
return Polygon(pD).intersection(Polygon(pG)).area
def one_to_one_match(row, col):
cont = 0
for j in range(len(recallMat[0])):
if (
recallMat[row, j] >= self.area_recall_constraint
and precisionMat[row, j] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
cont = 0
for i in range(len(recallMat)):
if (
recallMat[i, col] >= self.area_recall_constraint
and precisionMat[i, col] >= self.area_precision_constraint
):
cont = cont + 1
if cont != 1:
return False
if (
recallMat[row, col] >= self.area_recall_constraint
and precisionMat[row, col] >= self.area_precision_constraint
):
return True
return False
def one_to_many_match(gtNum):
many_sum = 0
detRects = []
for detNum in range(len(recallMat[0])):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and detNum not in detDontCareRectsNum
):
if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
many_sum += recallMat[gtNum, detNum]
detRects.append(detNum)
if round(many_sum, 4) >= self.area_recall_constraint:
return True, detRects
else:
return False, []
def many_to_one_match(detNum):
many_sum = 0
gtRects = []
for gtNum in range(len(recallMat)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
):
if recallMat[gtNum, detNum] >= self.area_recall_constraint:
many_sum += precisionMat[gtNum, detNum]
gtRects.append(gtNum)
if round(many_sum, 4) >= self.area_precision_constraint:
return True, gtRects
else:
return False, []
def center_distance(r1, r2):
return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5
def diag(r):
r = np.array(r)
return (
(r[:, 0].max() - r[:, 0].min()) ** 2
+ (r[:, 1].max() - r[:, 1].min()) ** 2
) ** 0.5
perSampleMetrics = {}
recall = 0
precision = 0
hmean = 0
recallAccum = 0.0
precisionAccum = 0.0
gtRects = []
detRects = []
gtPolPoints = []
detPolPoints = []
gtDontCareRectsNum = (
[]
) # Array of Ground Truth Rectangles' keys marked as don't Care
detDontCareRectsNum = (
[]
) # Array of Detected Rectangles' matched with a don't Care GT
pairs = []
evaluationLog = ""
recallMat = np.empty([1, 1])
precisionMat = np.empty([1, 1])
for n in range(len(gt)):
points = gt[n]["points"]
# transcription = gt[n]['text']
dontCare = gt[n]["ignore"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
gtRects.append(points)
gtPolPoints.append(points)
if dontCare:
gtDontCareRectsNum.append(len(gtRects) - 1)
evaluationLog += (
"GT rectangles: "
+ str(len(gtRects))
+ (
" (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
if len(gtDontCareRectsNum) > 0
else "\n"
)
)
for n in range(len(pred)):
points = pred[n]["points"]
if not Polygon(points).is_valid or not Polygon(points).is_simple:
continue
detRect = points
detRects.append(detRect)
detPolPoints.append(points)
if len(gtDontCareRectsNum) > 0:
for dontCareRectNum in gtDontCareRectsNum:
dontCareRect = gtRects[dontCareRectNum]
intersected_area = get_intersection(dontCareRect, detRect)
rdDimensions = Polygon(detRect).area
if rdDimensions == 0:
precision = 0
else:
precision = intersected_area / rdDimensions
if precision > 0.5:
detDontCareRectsNum.append(len(detRects) - 1)
break
evaluationLog += (
"DET rectangles: "
+ str(len(detRects))
+ (
" (" + str(len(detDontCareRectsNum)) + " don't care)\n"
if len(detDontCareRectsNum) > 0
else "\n"
)
)
if len(gtRects) == 0:
recall = 1
precision = 0 if len(detRects) > 0 else 1
if len(detRects) > 0:
# Calculate recall and precision matrixs
outputShape = [len(gtRects), len(detRects)]
recallMat = np.empty(outputShape)
precisionMat = np.empty(outputShape)
gtRectMat = np.zeros(len(gtRects), np.int8)
detRectMat = np.zeros(len(detRects), np.int8)
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
rG = gtRects[gtNum]
rD = detRects[detNum]
intersected_area = get_intersection(rG, rD)
rgDimensions = Polygon(rG).area
rdDimensions = Polygon(rD).area
recallMat[gtNum, detNum] = (
0 if rgDimensions == 0 else intersected_area / rgDimensions
)
precisionMat[gtNum, detNum] = (
0 if rdDimensions == 0 else intersected_area / rdDimensions
)
# Find one-to-one matches
evaluationLog += "Find one-to-one matches\n"
for gtNum in range(len(gtRects)):
for detNum in range(len(detRects)):
if (
gtRectMat[gtNum] == 0
and detRectMat[detNum] == 0
and gtNum not in gtDontCareRectsNum
and detNum not in detDontCareRectsNum
):
match = one_to_one_match(gtNum, detNum)
if match is True:
                            # following DetEval, perform additional validation before marking a pair as one-to-one
rG = gtRects[gtNum]
rD = detRects[detNum]
normDist = center_distance(rG, rD)
normDist /= diag(rG) + diag(rD)
normDist *= 2.0
if normDist < self.ev_param_ind_center_diff_thr:
gtRectMat[gtNum] = 1
detRectMat[detNum] = 1
recallAccum += 1.0
precisionAccum += 1.0
pairs.append({"gt": gtNum, "det": detNum, "type": "OO"})
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ "\n"
)
else:
evaluationLog += (
"Match Discarded GT #"
+ str(gtNum)
+ " with Det #"
+ str(detNum)
+ " normDist: "
+ str(normDist)
+ " \n"
)
# Find one-to-many matches
evaluationLog += "Find one-to-many matches\n"
for gtNum in range(len(gtRects)):
if gtNum not in gtDontCareRectsNum:
match, matchesDet = one_to_many_match(gtNum)
if match is True:
gtRectMat[gtNum] = 1
recallAccum += 1.0
precisionAccum += len(matchesDet) / (
1 + math.log(len(matchesDet))
)
pairs.append(
{
"gt": gtNum,
"det": matchesDet,
"type": "OO" if len(matchesDet) == 1 else "OM",
}
)
for detNum in matchesDet:
detRectMat[detNum] = 1
evaluationLog += (
"Match GT #"
+ str(gtNum)
+ " with Det #"
+ str(matchesDet)
+ "\n"
)
# Find many-to-one matches
evaluationLog += "Find many-to-one matches\n"
for detNum in range(len(detRects)):
if detNum not in detDontCareRectsNum:
match, matchesGt = many_to_one_match(detNum)
if match is True:
detRectMat[detNum] = 1
recallAccum += len(matchesGt) / (1 + math.log(len(matchesGt)))
precisionAccum += 1.0
pairs.append(
{
"gt": matchesGt,
"det": detNum,
"type": "OO" if len(matchesGt) == 1 else "MO",
}
)
for gtNum in matchesGt:
gtRectMat[gtNum] = 1
evaluationLog += (
"Match GT #"
+ str(matchesGt)
+ " with Det #"
+ str(detNum)
+ "\n"
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
if numGtCare == 0:
recall = float(1)
precision = float(0) if len(detRects) > 0 else float(1)
else:
recall = float(recallAccum) / numGtCare
precision = (
float(0)
if (len(detRects) - len(detDontCareRectsNum)) == 0
else float(precisionAccum)
/ (len(detRects) - len(detDontCareRectsNum))
)
hmean = (
0
if (precision + recall) == 0
else 2.0 * precision * recall / (precision + recall)
)
numGtCare = len(gtRects) - len(gtDontCareRectsNum)
numDetCare = len(detRects) - len(detDontCareRectsNum)
perSampleMetrics = {
"precision": precision,
"recall": recall,
"hmean": hmean,
"pairs": pairs,
"recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
"precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
"gtPolPoints": gtPolPoints,
"detPolPoints": detPolPoints,
"gtCare": numGtCare,
"detCare": numDetCare,
"gtDontCare": gtDontCareRectsNum,
"detDontCare": detDontCareRectsNum,
"recallAccum": recallAccum,
"precisionAccum": precisionAccum,
"evaluationLog": evaluationLog,
}
return perSampleMetrics
def combine_results(self, results):
numGt = 0
numDet = 0
methodRecallSum = 0
methodPrecisionSum = 0
for result in results:
numGt += result["gtCare"]
numDet += result["detCare"]
methodRecallSum += result["recallAccum"]
methodPrecisionSum += result["precisionAccum"]
methodRecall = 0 if numGt == 0 else methodRecallSum / numGt
methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet
methodHmean = (
0
if methodRecall + methodPrecision == 0
else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
)
methodMetrics = {
"precision": methodPrecision,
"recall": methodRecall,
"hmean": methodHmean,
}
return methodMetrics
if __name__ == "__main__":
    evaluator = DetectionMTWI2018Evaluator()
gts = [
[
{
"points": [(0, 0), (1, 0), (1, 1), (0, 1)],
"text": 1234,
"ignore": False,
},
{
"points": [(2, 2), (3, 2), (3, 3), (2, 3)],
"text": 5678,
"ignore": True,
},
]
]
preds = [
[
{
"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
"text": 123,
"ignore": False,
}
]
]
results = []
for gt, pred in zip(gts, preds):
results.append(evaluator.evaluate_image(gt, pred))
metrics = evaluator.combine_results(results)
print(metrics)

View File

@ -0,0 +1,100 @@
import numpy as np
from .detection.iou import DetectionIoUEvaluator
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
return self
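# Usage sketch (illustrative): update() takes a value and an optional sample
# count, so per-batch metrics can be averaged with correct weighting:
#
#     m = AverageMeter()
#     m.update(0.8, n=4)   # a batch of 4 samples scoring 0.8
#     m.update(0.5, n=1)   # a single sample scoring 0.5
#     print(m.avg)         # (0.8*4 + 0.5*1) / 5 = 0.74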
class QuadMetric:
def __init__(self, is_output_polygon=False):
self.is_output_polygon = is_output_polygon
self.evaluator = DetectionIoUEvaluator(is_output_polygon=is_output_polygon)
def measure(self, batch, output, box_thresh=0.6):
"""
batch: (image, polygons, ignore_tags
batch: a dict produced by dataloaders.
image: tensor of shape (N, C, H, W).
polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
shape: the original shape of images.
filename: the original filenames of images.
output: (polygons, ...)
"""
results = []
        gt_polygons_batch = batch["text_polys"]
ignore_tags_batch = batch["ignore_tags"]
pred_polygons_batch = np.array(output[0])
pred_scores_batch = np.array(output[1])
for polygons, pred_polygons, pred_scores, ignore_tags in zip(
            gt_polygons_batch, pred_polygons_batch, pred_scores_batch, ignore_tags_batch
):
gt = [
dict(points=np.int64(polygons[i]), ignore=ignore_tags[i])
for i in range(len(polygons))
]
if self.is_output_polygon:
pred = [
dict(points=pred_polygons[i]) for i in range(len(pred_polygons))
]
else:
pred = []
# print(pred_polygons.shape)
for i in range(pred_polygons.shape[0]):
if pred_scores[i] >= box_thresh:
# print(pred_polygons[i,:,:].tolist())
pred.append(
dict(points=pred_polygons[i, :, :].astype(np.int32))
)
# pred = [dict(points=pred_polygons[i,:,:].tolist()) if pred_scores[i] >= box_thresh for i in range(pred_polygons.shape[0])]
results.append(self.evaluator.evaluate_image(gt, pred))
return results
def validate_measure(self, batch, output, box_thresh=0.6):
return self.measure(batch, output, box_thresh)
def evaluate_measure(self, batch, output):
return (
self.measure(batch, output),
np.linspace(0, batch["image"].shape[0]).tolist(),
)
def gather_measure(self, raw_metrics):
raw_metrics = [
image_metrics
for batch_metrics in raw_metrics
for image_metrics in batch_metrics
]
result = self.evaluator.combine_results(raw_metrics)
precision = AverageMeter()
recall = AverageMeter()
fmeasure = AverageMeter()
precision.update(result["precision"], n=len(raw_metrics))
recall.update(result["recall"], n=len(raw_metrics))
fmeasure_score = (
2 * precision.val * recall.val / (precision.val + recall.val + 1e-8)
)
fmeasure.update(fmeasure_score)
return {"precision": precision, "recall": recall, "fmeasure": fmeasure}
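A minimal usage sketch for the metric above (illustrative shapes and values; it assumes QuadMetric is importable and that DetectionIoUEvaluator's per-image results expose the same keys, gtCare/detCare/recallAccum/precisionAccum, as the evaluator shown earlier):
```
import numpy as np

# One image: one ground-truth quad, one predicted quad with its confidence.
batch = {
    "text_polys": [np.array([[[0, 0], [10, 0], [10, 10], [0, 10]]])],
    "ignore_tags": [[False]],
}
output = (
    np.array([[[[0, 0], [9, 0], [9, 10], [0, 10]]]], dtype=np.float32),  # boxes
    np.array([[0.9]]),                                                   # scores
)

metric = QuadMetric(is_output_polygon=False)
raw = metric.measure(batch, output, box_thresh=0.6)  # list of per-image results
summary = metric.gather_measure([raw])
print(summary["precision"].avg, summary["recall"].avg, summary["fmeasure"].avg)
```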


@ -0,0 +1,112 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
_profiler_step_id = 0
# A global variable to avoid parsing from string every time.
_profiler_options = None
class ProfilerOptions(object):
"""
Use a string to initialize a ProfilerOptions.
    The string should be in the format: "key1=value1;key2=value2;key3=value3".
For example:
"profile_path=model.profile"
"batch_range=[50, 60]; profile_path=model.profile"
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
    ProfilerOptions supports the following key-value pairs:
        batch_range - an integer list, e.g. [100, 110].
        state - a string, the optional values are 'CPU', 'GPU' or 'All'.
        sorted_key - a string, the optional values are 'calls', 'total',
            'max', 'min' or 'ave'.
tracer_option - a string, the optional values are 'Default', 'OpDetail',
'AllOpDetail'.
profile_path - a string, the path to save the serialized profile data,
which can be used to generate a timeline.
exit_on_finished - a boolean.
"""
def __init__(self, options_str):
assert isinstance(options_str, str)
self._options = {
"batch_range": [10, 20],
"state": "All",
"sorted_key": "total",
"tracer_option": "Default",
"profile_path": "/tmp/profile",
"exit_on_finished": True,
}
self._parse_from_string(options_str)
def _parse_from_string(self, options_str):
for kv in options_str.replace(" ", "").split(";"):
key, value = kv.split("=")
if key == "batch_range":
value_list = value.replace("[", "").replace("]", "").split(",")
value_list = list(map(int, value_list))
if (
len(value_list) >= 2
and value_list[0] >= 0
and value_list[1] > value_list[0]
):
self._options[key] = value_list
elif key == "exit_on_finished":
self._options[key] = value.lower() in ("yes", "true", "t", "1")
elif key in ["state", "sorted_key", "tracer_option", "profile_path"]:
self._options[key] = value
def __getitem__(self, name):
if self._options.get(name, None) is None:
raise ValueError("ProfilerOptions does not have an option named %s." % name)
return self._options[name]
def add_profiler_step(options_str=None):
"""
    Enable operator-level timing using PaddlePaddle's profiler.
    The profiler uses an independent variable to count the profiler steps.
    Each call of this function is treated as one profiler step.
    Args:
        options_str - a string used to initialize the ProfilerOptions.
            Default is None, in which case the profiler is disabled.
"""
if options_str is None:
return
global _profiler_step_id
global _profiler_options
if _profiler_options is None:
_profiler_options = ProfilerOptions(options_str)
if _profiler_step_id == _profiler_options["batch_range"][0]:
paddle.utils.profiler.start_profiler(
_profiler_options["state"], _profiler_options["tracer_option"]
)
elif _profiler_step_id == _profiler_options["batch_range"][1]:
paddle.utils.profiler.stop_profiler(
_profiler_options["sorted_key"], _profiler_options["profile_path"]
)
if _profiler_options["exit_on_finished"]:
sys.exit(0)
_profiler_step_id += 1
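A sketch of how a training loop is expected to drive this helper; the option string follows the ProfilerOptions format documented above, and the loop body is a placeholder:
```
# Profile steps 10..20, then keep training instead of exiting.
options = "batch_range=[10, 20];profile_path=/tmp/profile;exit_on_finished=false"

for step in range(100):
    add_profiler_step(options)  # no-op before step 10; dumps the profile after step 20
    # ... run one training iteration here ...
```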


@ -0,0 +1,72 @@
from paddle.optimizer import lr
import logging
__all__ = ["Polynomial"]
class Polynomial(object):
"""
Polynomial learning rate decay
Args:
        learning_rate (float): The initial learning rate. It is a python float number.
        epochs (int): The decay epoch size. It determines the decay cycle; when by_epoch is True, it is converted to epochs = epochs * step_each_epoch.
        step_each_epoch (int): The number of steps in each epoch.
        end_lr (float, optional): The minimum final learning rate. Default: 0.0.
        power (float, optional): Power of polynomial. Default: 1.0.
        warmup_epoch (int): The number of epochs for LinearWarmup. Default: 0; when by_epoch is True, it is converted to warmup_epoch = warmup_epoch * step_each_epoch.
        warmup_start_lr (float): Initial learning rate of warm up. Default: 0.0.
        last_epoch (int, optional): The index of the last epoch. Can be set to resume training. Default: -1, meaning the initial learning rate.
        by_epoch (bool): Whether epochs and warmup_epoch are given in epochs rather than iterations; when True, both are multiplied by step_each_epoch. Default: True.
"""
def __init__(
self,
learning_rate,
epochs,
step_each_epoch,
end_lr=0.0,
power=1.0,
warmup_epoch=0,
warmup_start_lr=0.0,
last_epoch=-1,
by_epoch=True,
**kwargs,
):
super().__init__()
if warmup_epoch >= epochs:
            msg = f'When using warm up, the value of "epochs" must be greater than the value of "Optimizer.lr.warmup_epoch". "Optimizer.lr.warmup_epoch" has been clipped to {epochs}.'
logging.warning(msg)
warmup_epoch = epochs
self.learning_rate = learning_rate
self.epochs = epochs
self.end_lr = end_lr
self.power = power
self.last_epoch = last_epoch
self.warmup_epoch = warmup_epoch
self.warmup_start_lr = warmup_start_lr
if by_epoch:
self.epochs *= step_each_epoch
self.warmup_epoch = int(self.warmup_epoch * step_each_epoch)
def __call__(self):
learning_rate = (
lr.PolynomialDecay(
learning_rate=self.learning_rate,
decay_steps=self.epochs,
end_lr=self.end_lr,
power=self.power,
last_epoch=self.last_epoch,
)
if self.epochs > 0
else self.learning_rate
)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=self.warmup_start_lr,
end_lr=self.learning_rate,
last_epoch=self.last_epoch,
)
return learning_rate
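A small usage sketch, assuming paddle is installed: calling the instance builds the underlying paddle scheduler (PolynomialDecay, wrapped in LinearWarmup when warmup_epoch > 0), which can then be passed to an optimizer. The numbers are illustrative:
```
sched = Polynomial(
    learning_rate=0.001,
    epochs=10,            # becomes 10 * 100 decay steps, since by_epoch=True
    step_each_epoch=100,
    warmup_epoch=1,       # becomes 100 linear warmup steps
)()
for _ in range(10):
    sched.step()          # one call per training iteration
print(sched.get_lr())     # still inside the warmup ramp toward 0.001
```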


@ -0,0 +1,365 @@
# -*- coding: utf-8 -*-
# @Time : 2019/8/23 21:59
# @Author : zhoujun
import json
import pathlib
import time
import os
import glob
import cv2
import yaml
from typing import Mapping
import matplotlib.pyplot as plt
import numpy as np
from argparse import ArgumentParser, RawDescriptionHelpFormatter
def _check_image_file(path):
img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff", "gif", "pdf"}
return any([path.lower().endswith(e) for e in img_end])
def get_image_file_list(img_file):
imgs_lists = []
if img_file is None or not os.path.exists(img_file):
raise Exception("not found any img file in {}".format(img_file))
img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff", "gif", "pdf"}
if os.path.isfile(img_file) and _check_image_file(img_file):
imgs_lists.append(img_file)
elif os.path.isdir(img_file):
for single_file in os.listdir(img_file):
file_path = os.path.join(img_file, single_file)
if os.path.isfile(file_path) and _check_image_file(file_path):
imgs_lists.append(file_path)
if len(imgs_lists) == 0:
raise Exception("not found any img file in {}".format(img_file))
imgs_lists = sorted(imgs_lists)
return imgs_lists
def setup_logger(log_file_path: str = None):
import logging
logging._warn_preinit_stderr = 0
logger = logging.getLogger("DBNet.paddle")
formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
if log_file_path is not None:
file_handle = logging.FileHandler(log_file_path)
file_handle.setFormatter(formatter)
logger.addHandler(file_handle)
logger.setLevel(logging.DEBUG)
return logger
# --exeTime
def exe_time(func):
def newFunc(*args, **args2):
t0 = time.time()
back = func(*args, **args2)
print("{} cost {:.3f}s".format(func.__name__, time.time() - t0))
return back
return newFunc
def load(file_path: str):
file_path = pathlib.Path(file_path)
func_dict = {".txt": _load_txt, ".json": _load_json, ".list": _load_txt}
assert file_path.suffix in func_dict
return func_dict[file_path.suffix](file_path)
def _load_txt(file_path: str):
with open(file_path, "r", encoding="utf8") as f:
content = [
x.strip().strip("\ufeff").strip("\xef\xbb\xbf") for x in f.readlines()
]
return content
def _load_json(file_path: str):
with open(file_path, "r", encoding="utf8") as f:
content = json.load(f)
return content
def save(data, file_path):
file_path = pathlib.Path(file_path)
func_dict = {".txt": _save_txt, ".json": _save_json}
assert file_path.suffix in func_dict
return func_dict[file_path.suffix](data, file_path)
def _save_txt(data, file_path):
"""
    Write a list of strings into a txt file, one item per line.
    :param data: data to write (a list, or a single item)
    :param file_path: path of the output txt file
    :return:
"""
if not isinstance(data, list):
data = [data]
with open(file_path, mode="w", encoding="utf8") as f:
f.write("\n".join(data))
def _save_json(data, file_path):
with open(file_path, "w", encoding="utf-8") as json_file:
json.dump(data, json_file, ensure_ascii=False, indent=4)
def show_img(imgs: np.ndarray, title="img"):
color = len(imgs.shape) == 3 and imgs.shape[-1] == 3
imgs = np.expand_dims(imgs, axis=0)
for i, img in enumerate(imgs):
plt.figure()
plt.title("{}_{}".format(title, i))
plt.imshow(img, cmap=None if color else "gray")
plt.show()
def draw_bbox(img_path, result, color=(255, 0, 0), thickness=2):
if isinstance(img_path, str):
img_path = cv2.imread(img_path)
# img_path = cv2.cvtColor(img_path, cv2.COLOR_BGR2RGB)
img_path = img_path.copy()
for point in result:
point = point.astype(int)
cv2.polylines(img_path, [point], True, color, thickness)
return img_path
def cal_text_score(texts, gt_texts, training_masks, running_metric_text, thred=0.5):
training_masks = training_masks.numpy()
pred_text = texts.numpy() * training_masks
pred_text[pred_text <= thred] = 0
pred_text[pred_text > thred] = 1
pred_text = pred_text.astype(np.int32)
gt_text = gt_texts.numpy() * training_masks
gt_text = gt_text.astype(np.int32)
running_metric_text.update(gt_text, pred_text)
score_text, _ = running_metric_text.get_scores()
return score_text
def order_points_clockwise(pts):
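    # Order 4 points as top-left, top-right, bottom-right, bottom-left:
    # the top-left corner has the smallest x + y sum, the bottom-right the
    # largest; the top-right has the smallest y - x difference, the
    # bottom-left the largest.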
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def order_points_clockwise_list(pts):
pts = pts.tolist()
pts.sort(key=lambda x: (x[1], x[0]))
pts[:2] = sorted(pts[:2], key=lambda x: x[0])
pts[2:] = sorted(pts[2:], key=lambda x: -x[0])
pts = np.array(pts)
return pts
def get_datalist(train_data_path):
"""
    Collect the training/validation data list.
    :param train_data_path: list of dataset files; each line inside is stored as: path/to/img\tlabel
:return:
"""
train_data = []
for p in train_data_path:
with open(p, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip("\n").replace(".jpg ", ".jpg\t").split("\t")
if len(line) > 1:
img_path = pathlib.Path(line[0].strip(" "))
label_path = pathlib.Path(line[1].strip(" "))
if (
img_path.exists()
and img_path.stat().st_size > 0
and label_path.exists()
and label_path.stat().st_size > 0
):
train_data.append((str(img_path), str(label_path)))
return train_data
def save_result(result_path, box_list, score_list, is_output_polygon):
if is_output_polygon:
with open(result_path, "wt") as res:
for i, box in enumerate(box_list):
box = box.reshape(-1).tolist()
result = ",".join([str(int(x)) for x in box])
score = score_list[i]
res.write(result + "," + str(score) + "\n")
else:
with open(result_path, "wt") as res:
for i, box in enumerate(box_list):
score = score_list[i]
box = box.reshape(-1).tolist()
result = ",".join([str(int(x)) for x in box])
res.write(result + "," + str(score) + "\n")
def expand_polygon(polygon):
"""
    Expand boxes that contain only a single character.
"""
(x, y), (w, h), angle = cv2.minAreaRect(np.float32(polygon))
if angle < -45:
w, h = h, w
angle += 90
new_w = w + h
box = ((x, y), (new_w, h), angle)
points = cv2.boxPoints(box)
return order_points_clockwise(points)
def _merge_dict(config, merge_dct):
"""Recursive dict merge. Inspired by :meth:``dict.update()``, instead of
updating only top-level keys, dict_merge recurses down into dicts nested
to an arbitrary depth, updating keys. The ``merge_dct`` is merged into
``dct``.
Args:
config: dict onto which the merge is executed
merge_dct: dct merged into config
Returns: dct
"""
for key, value in merge_dct.items():
sub_keys = key.split(".")
key = sub_keys[0]
if key in config and len(sub_keys) > 1:
_merge_dict(config[key], {".".join(sub_keys[1:]): value})
elif (
key in config
and isinstance(config[key], dict)
and isinstance(value, Mapping)
):
_merge_dict(config[key], value)
else:
config[key] = value
return config
def print_dict(cfg, print_func=print, delimiter=0):
"""
    Recursively print a dict, indenting according to the nesting of keys.
"""
for k, v in sorted(cfg.items()):
if isinstance(v, dict):
print_func("{}{} : ".format(delimiter * " ", str(k)))
print_dict(v, print_func, delimiter + 4)
elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
print_func("{}{} : ".format(delimiter * " ", str(k)))
for value in v:
print_dict(value, print_func, delimiter + 4)
else:
print_func("{}{} : {}".format(delimiter * " ", k, v))
class Config(object):
def __init__(self, config_path, BASE_KEY="base"):
self.BASE_KEY = BASE_KEY
self.cfg = self._load_config_with_base(config_path)
def _load_config_with_base(self, file_path):
"""
Load config from file.
Args:
file_path (str): Path of the config file to be loaded.
Returns: global config
"""
_, ext = os.path.splitext(file_path)
assert ext in [".yml", ".yaml"], "only support yaml files for now"
with open(file_path) as f:
file_cfg = yaml.load(f, Loader=yaml.Loader)
# NOTE: cfgs outside have higher priority than cfgs in _BASE_
if self.BASE_KEY in file_cfg:
all_base_cfg = dict()
base_ymls = list(file_cfg[self.BASE_KEY])
            for base_yml in base_ymls:
                base_cfg = self._load_config_with_base(base_yml)
                all_base_cfg = _merge_dict(all_base_cfg, base_cfg)
del file_cfg[self.BASE_KEY]
file_cfg = _merge_dict(all_base_cfg, file_cfg)
file_cfg["filename"] = os.path.splitext(os.path.split(file_path)[-1])[0]
return file_cfg
def merge_dict(self, args):
self.cfg = _merge_dict(self.cfg, args)
def print_cfg(self, print_func=print):
"""
        Recursively print the config, indenting according to the nesting of keys.
"""
print_func("----------- Config -----------")
print_dict(self.cfg, print_func)
print_func("---------------------------------------------")
def save(self, p):
with open(p, "w") as f:
yaml.dump(dict(self.cfg), f, default_flow_style=False, sort_keys=False)
class ArgsParser(ArgumentParser):
def __init__(self):
super(ArgsParser, self).__init__(formatter_class=RawDescriptionHelpFormatter)
self.add_argument("-c", "--config_file", help="configuration file to use")
self.add_argument("-o", "--opt", nargs="*", help="set configuration options")
self.add_argument(
"-p",
"--profiler_options",
type=str,
default=None,
help="The option of profiler, which should be in format "
'"key1=value1;key2=value2;key3=value3".',
)
def parse_args(self, argv=None):
args = super(ArgsParser, self).parse_args(argv)
assert (
args.config_file is not None
), "Please specify --config_file=configure_file_path."
args.opt = self._parse_opt(args.opt)
return args
def _parse_opt(self, opts):
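        # Each opt is "key=value"; a dotted key such as
        # "Global.epoch_num=10" descends into nested dicts, mirroring
        # the yml hierarchy, and the value is parsed with yaml.load.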
config = {}
if not opts:
return config
for s in opts:
s = s.strip()
k, v = s.split("=", 1)
if "." not in k:
config[k] = yaml.load(v, Loader=yaml.Loader)
else:
keys = k.split(".")
if keys[0] not in config:
config[keys[0]] = {}
cur = config[keys[0]]
for idx, key in enumerate(keys[1:]):
if idx == len(keys) - 2:
cur[key] = yaml.load(v, Loader=yaml.Loader)
else:
cur[key] = {}
cur = cur[key]
return config
if __name__ == "__main__":
img = np.zeros((1, 3, 640, 640))
show_img(img[0][0])
plt.show()
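For reference, a quick illustration of the dotted-key merge semantics implemented by _merge_dict above (the values are made up):
```
cfg = {"optimizer": {"name": "Adam", "lr": 0.001}}
_merge_dict(cfg, {"optimizer.lr": 0.01})         # dotted keys descend into nested dicts
_merge_dict(cfg, {"optimizer": {"beta1": 0.9}})  # nested dicts merge recursively
print(cfg)  # {'optimizer': {'name': 'Adam', 'lr': 0.01, 'beta1': 0.9}}
```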


@ -0,0 +1,354 @@
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import json
import os
import re
import traceback
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--filename", type=str, help="The name of log which need to analysis."
)
parser.add_argument(
"--log_with_profiler", type=str, help="The path of train log with profiler"
)
parser.add_argument(
"--profiler_path", type=str, help="The path of profiler timeline log."
)
parser.add_argument("--keyword", type=str, help="Keyword to specify analysis data")
parser.add_argument(
"--separator",
type=str,
default=None,
help="Separator of different field in log",
)
parser.add_argument(
"--position", type=int, default=None, help="The position of data field"
)
parser.add_argument(
"--range", type=str, default="", help="The range of data field to intercept"
)
parser.add_argument("--base_batch_size", type=int, help="base_batch size on gpu")
parser.add_argument(
"--skip_steps", type=int, default=0, help="The number of steps to be skipped"
)
parser.add_argument(
"--model_mode", type=int, default=-1, help="Analysis mode, default value is -1"
)
parser.add_argument("--ips_unit", type=str, default=None, help="IPS unit")
parser.add_argument(
"--model_name",
type=str,
        default="",
help="training model_name, transformer_base",
)
parser.add_argument(
"--mission_name", type=str, default=0, help="training mission name"
)
parser.add_argument(
"--direction_id", type=int, default=0, help="training direction_id"
)
parser.add_argument(
"--run_mode", type=str, default="sp", help="multi process or single process"
)
parser.add_argument(
"--index",
type=int,
default=1,
help="{1: speed, 2:mem, 3:profiler, 6:max_batch_size}",
)
parser.add_argument("--gpu_num", type=int, default=1, help="nums of training gpus")
args = parser.parse_args()
args.separator = None if args.separator == "None" else args.separator
return args
def _is_number(num):
pattern = re.compile(r"^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$")
result = pattern.match(num)
if result:
return True
else:
return False
class TimeAnalyzer(object):
def __init__(
self, filename, keyword=None, separator=None, position=None, range="-1"
):
if filename is None:
raise Exception("Please specify the filename!")
if keyword is None:
raise Exception("Please specify the keyword!")
self.filename = filename
self.keyword = keyword
self.separator = separator
self.position = position
self.range = range
self.records = None
self._distil()
def _distil(self):
self.records = []
with open(self.filename, "r") as f_object:
lines = f_object.readlines()
for line in lines:
if self.keyword not in line:
continue
try:
result = None
# Distil the string from a line.
line = line.strip()
line_words = (
line.split(self.separator) if self.separator else line.split()
)
                    if self.position:
result = line_words[self.position]
else:
# Distil the string following the keyword.
for i in range(len(line_words) - 1):
if line_words[i] == self.keyword:
result = line_words[i + 1]
break
# Distil the result from the picked string.
if not self.range:
result = result[0:]
elif _is_number(self.range):
result = result[0 : int(self.range)]
else:
result = result[
int(self.range.split(":")[0]) : int(
self.range.split(":")[1]
)
]
self.records.append(float(result))
                except Exception as exc:
                    print(
                        "line is: {}; separator={}; position={}; error: {}".format(
                            line, self.separator, self.position, exc
                        )
                    )
print(
"Extract {} records: separator={}; position={}".format(
len(self.records), self.separator, self.position
)
)
def _get_fps(self, mode, batch_size, gpu_num, avg_of_records, run_mode, unit=None):
if mode == -1 and run_mode == "sp":
assert unit, "Please set the unit when mode is -1."
fps = gpu_num * avg_of_records
elif mode == -1 and run_mode == "mp":
assert unit, "Please set the unit when mode is -1."
fps = gpu_num * avg_of_records # temporarily, not used now
print("------------this is mp")
elif mode == 0:
# s/step -> samples/s
fps = (batch_size * gpu_num) / avg_of_records
unit = "samples/s"
elif mode == 1:
# steps/s -> steps/s
fps = avg_of_records
unit = "steps/s"
elif mode == 2:
# s/step -> steps/s
fps = 1 / avg_of_records
unit = "steps/s"
elif mode == 3:
# steps/s -> samples/s
fps = batch_size * gpu_num * avg_of_records
unit = "samples/s"
elif mode == 4:
# s/epoch -> s/epoch
fps = avg_of_records
unit = "s/epoch"
        else:
            raise ValueError("Unsupported analysis mode.")
return fps, unit
def analysis(
self, batch_size, gpu_num=1, skip_steps=0, mode=-1, run_mode="sp", unit=None
):
if batch_size <= 0:
print("base_batch_size should larger than 0.")
return 0, ""
        if (
            len(self.records) <= skip_steps
        ):  # handle logs whose record count does not exceed skip_steps
print("no records")
return 0, ""
sum_of_records = 0
sum_of_records_skipped = 0
skip_min = self.records[skip_steps]
skip_max = self.records[skip_steps]
count = len(self.records)
for i in range(count):
sum_of_records += self.records[i]
if i >= skip_steps:
sum_of_records_skipped += self.records[i]
if self.records[i] < skip_min:
skip_min = self.records[i]
if self.records[i] > skip_max:
skip_max = self.records[i]
avg_of_records = sum_of_records / float(count)
avg_of_records_skipped = sum_of_records_skipped / float(count - skip_steps)
fps, fps_unit = self._get_fps(
mode, batch_size, gpu_num, avg_of_records, run_mode, unit
)
fps_skipped, _ = self._get_fps(
mode, batch_size, gpu_num, avg_of_records_skipped, run_mode, unit
)
if mode == -1:
print("average ips of %d steps, skip 0 step:" % count)
print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
print("\tFPS: %.3f %s" % (fps, fps_unit))
if skip_steps > 0:
print("average ips of %d steps, skip %d steps:" % (count, skip_steps))
print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
print("\tMin: %.3f %s" % (skip_min, fps_unit))
print("\tMax: %.3f %s" % (skip_max, fps_unit))
print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
elif mode == 1 or mode == 3:
print("average latency of %d steps, skip 0 step:" % count)
print("\tAvg: %.3f steps/s" % avg_of_records)
print("\tFPS: %.3f %s" % (fps, fps_unit))
if skip_steps > 0:
print(
"average latency of %d steps, skip %d steps:" % (count, skip_steps)
)
print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
print("\tMin: %.3f steps/s" % skip_min)
print("\tMax: %.3f steps/s" % skip_max)
print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
elif mode == 0 or mode == 2:
print("average latency of %d steps, skip 0 step:" % count)
print("\tAvg: %.3f s/step" % avg_of_records)
print("\tFPS: %.3f %s" % (fps, fps_unit))
if skip_steps > 0:
print(
"average latency of %d steps, skip %d steps:" % (count, skip_steps)
)
print("\tAvg: %.3f s/step" % avg_of_records_skipped)
print("\tMin: %.3f s/step" % skip_min)
print("\tMax: %.3f s/step" % skip_max)
print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
return round(fps_skipped, 3), fps_unit
if __name__ == "__main__":
args = parse_args()
run_info = dict()
run_info["log_file"] = args.filename
run_info["model_name"] = args.model_name
run_info["mission_name"] = args.mission_name
run_info["direction_id"] = args.direction_id
run_info["run_mode"] = args.run_mode
run_info["index"] = args.index
run_info["gpu_num"] = args.gpu_num
run_info["FINAL_RESULT"] = 0
run_info["JOB_FAIL_FLAG"] = 0
try:
if args.index == 1:
if args.gpu_num == 1:
run_info["log_with_profiler"] = args.log_with_profiler
run_info["profiler_path"] = args.profiler_path
analyzer = TimeAnalyzer(
args.filename, args.keyword, args.separator, args.position, args.range
)
run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
batch_size=args.base_batch_size,
gpu_num=args.gpu_num,
skip_steps=args.skip_steps,
mode=args.model_mode,
run_mode=args.run_mode,
unit=args.ips_unit,
)
try:
if (
int(os.getenv("job_fail_flag")) == 1
or int(run_info["FINAL_RESULT"]) == 0
):
run_info["JOB_FAIL_FLAG"] = 1
        except Exception:
            pass
elif args.index == 3:
run_info["FINAL_RESULT"] = {}
records_fo_total = TimeAnalyzer(
args.filename, "Framework overhead", None, 3, ""
).records
records_fo_ratio = TimeAnalyzer(
args.filename, "Framework overhead", None, 5
).records
records_ct_total = TimeAnalyzer(
args.filename, "Computation time", None, 3, ""
).records
records_gm_total = TimeAnalyzer(
args.filename, "GpuMemcpy Calls", None, 4, ""
).records
records_gm_ratio = TimeAnalyzer(
args.filename, "GpuMemcpy Calls", None, 6
).records
records_gmas_total = TimeAnalyzer(
args.filename, "GpuMemcpyAsync Calls", None, 4, ""
).records
records_gms_total = TimeAnalyzer(
args.filename, "GpuMemcpySync Calls", None, 4, ""
).records
run_info["FINAL_RESULT"]["Framework_Total"] = (
records_fo_total[0] if records_fo_total else 0
)
run_info["FINAL_RESULT"]["Framework_Ratio"] = (
records_fo_ratio[0] if records_fo_ratio else 0
)
run_info["FINAL_RESULT"]["ComputationTime_Total"] = (
records_ct_total[0] if records_ct_total else 0
)
run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = (
records_gm_total[0] if records_gm_total else 0
)
run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = (
records_gm_ratio[0] if records_gm_ratio else 0
)
run_info["FINAL_RESULT"]["GpuMemcpyAsync_Total"] = (
records_gmas_total[0] if records_gmas_total else 0
)
run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = (
records_gms_total[0] if records_gms_total else 0
)
else:
print("Not support!")
except Exception:
traceback.print_exc()
print(
"{}".format(json.dumps(run_info))
) # it's required, for the log file path insert to the database
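A usage sketch for the analyzer above, mirroring how the __main__ block constructs it; the log file name is illustrative, and it assumes the log tags throughput lines with an "ips:" keyword, as the benchmark scripts below do:
```
analyzer = TimeAnalyzer(
    "det_res18_db_v2.0_sp_bs8_fp32_1",  # illustrative log file name
    keyword="ips:",
    range="",                           # keep the full field that follows the keyword
)
fps, unit = analyzer.analysis(
    batch_size=8,
    gpu_num=1,
    skip_steps=2,
    mode=-1,
    run_mode="sp",
    unit="images/sec",
)
print(fps, unit)
```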


@ -0,0 +1,30 @@
# PaddleOCR DB/EAST/PSE training benchmark
The files under the PaddleOCR/benchmark directory are used to collect and analyze training logs.
Training uses the icdar2015 dataset, which contains 1000 training images and 500 test images. The model configurations use resnet18_vd as the backbone and are trained with batch_size=8 and batch_size=16 respectively.
## Running the training benchmark
benchmark/run_det.sh covers the following steps:
- Install dependencies
- Download the data
- Run training
- Parse the logs to obtain IPS
The training step runs single-machine single-GPU (GPU 0 by default) and single-machine multi-GPU training, each with batch_size=8 and batch_size=16, so every model produces four log files after the run.
run_det.sh is executed as follows:
```
# cd PaddleOCR/
bash benchmark/run_det.sh
```
Taking DB as an example, the four log files are:
```
det_res18_db_v2.0_sp_bs16_fp32_1
det_res18_db_v2.0_sp_bs8_fp32_1
det_res18_db_v2.0_mp_bs16_fp32_1
det_res18_db_v2.0_mp_bs8_fp32_1
```


@ -0,0 +1,61 @@
#!/usr/bin/env bash
# Usage example: CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
# Parameter description
function _set_params(){
run_mode=${1:-"sp"} # 单卡sp|多卡mp
batch_size=${2:-"64"}
fp_item=${3:-"fp32"} # fp32|fp16
max_epoch=${4:-"10"} # 可选,如果需要修改代码提前中断
model_item=${5:-"model_item"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set later by QA
    # Parameters required for log parsing
base_batch_size=${batch_size}
mission_name="OCR"
direction_id="0"
ips_unit="images/sec"
    skip_steps=2                   # steps to skip when parsing logs; the first few steps of some models are slow (required)
    keyword="ips:"                 # keyword that marks the data lines in the log (required)
index="1"
    model_name=${model_item}_bs${batch_size}_${fp_item}  # model_item matches the yml file name; model_name is used for database entry and frontend display
    # No need to modify below
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}
function _train(){
echo "Train on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
train_cmd="-c configs/det/${model_item}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_epoch} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
case ${run_mode} in
sp)
train_cmd="python tools/train.py "${train_cmd}""
;;
mp)
rm -rf ./mylog
train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac
    # No need to modify below
echo ${train_cmd}
timeout 15m ${train_cmd} > ${log_file} 2>&1
if [ $? -ne 0 ];then
echo -e "${model_name}, FAIL"
export job_fail_flag=1
else
echo -e "${model_name}, SUCCESS"
export job_fail_flag=0
fi
if [ $run_mode = "mp" -a -d mylog ]; then
rm ${log_file}
cp mylog/workerlog.0 ${log_file}
fi
}
source ${BENCHMARK_ROOT}/scripts/run_model.sh # this script parses performance data from benchmark-compliant logs with analysis.py; during integration it can be downloaded from the benchmark repo: https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh; comment this line out if you only want to produce training logs, and re-enable it before submitting
_set_params $@
#_train # uncomment to only produce training logs, without parsing
_run # this function is defined in run_model.sh and calls _train; comment this line out if you only want training logs without integration, and re-enable it before submitting


@ -0,0 +1,36 @@
#!/bin/bash
# Script for stable, reproducible performance benchmarking; by default it runs inside the standard docker environment (py37): paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7, paddle=2.1.2, py=37
# Working directory: ./PaddleOCR
# 1. Install the dependencies this model needs (note any enabled optimization strategies)
log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
python -m pip install -r requirements.txt
# 2. Download the data and pretrained models this model needs
wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
# 3. Run in batch (if batching is inconvenient, steps 1-2 can be moved into each individual model)
model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse)
fp_item_list=(fp32)
for model_mode in ${model_mode_list[@]}; do
for fp_item in ${fp_item_list[@]}; do
if [ ${model_mode} == "det_r50_vd_east" ]; then
bs_list=(16)
else
bs_list=(8 16)
fi
for bs_item in ${bs_list[@]}; do
echo "index is speed, 1gpus, begin, ${model_name}"
run_mode=sp
log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 1 ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1 # (5min)
sleep 60
echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_name}"
run_mode=mp
log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
sleep 60
done
done
done


@ -0,0 +1,98 @@
Global:
debug: false
use_gpu: true
epoch_num: 100
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec_ppocr_v3_rotnet
save_epoch_step: 3
eval_batch_step: [0, 2000]
cal_metric_during_train: true
pretrained_model: null
checkpoints: null
save_inference_dir: null
use_visualdl: false
infer_img: doc/imgs_words/ch/word_1.jpg
character_dict_path: ppocr/utils/ppocr_keys_v1.txt
max_text_length: 25
infer_mode: false
use_space_char: true
save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
name: Adam
beta1: 0.9
beta2: 0.999
lr:
name: Cosine
learning_rate: 0.001
regularizer:
name: L2
factor: 1.0e-05
Architecture:
model_type: cls
algorithm: CLS
Transform: null
Backbone:
name: MobileNetV1Enhance
scale: 0.5
last_conv_stride: [1, 2]
last_pool_type: avg
Neck:
Head:
name: ClsHead
class_dim: 4
Loss:
name: ClsLoss
main_indicator: acc
PostProcess:
name: ClsPostProcess
Metric:
name: ClsMetric
main_indicator: acc
Train:
dataset:
name: SimpleDataSet
data_dir: ./train_data
label_file_list:
- ./train_data/train_list.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- BaseDataAugmentation:
- RandAugment:
- SSLRotateResize:
image_shape: [3, 48, 320]
- KeepKeys:
keep_keys: ["image", "label"]
loader:
collate_fn: "SSLRotateCollate"
shuffle: true
batch_size_per_card: 32
drop_last: true
num_workers: 8
Eval:
dataset:
name: SimpleDataSet
data_dir: ./train_data
label_file_list:
- ./train_data/val_list.txt
transforms:
- DecodeImage:
img_mode: BGR
channel_first: false
- SSLRotateResize:
image_shape: [3, 48, 320]
- KeepKeys:
keep_keys: ["image", "label"]
loader:
collate_fn: "SSLRotateCollate"
shuffle: false
drop_last: false
batch_size_per_card: 64
num_workers: 8
profiler_options: null
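As a sketch, the config above can be inspected with a plain YAML load (the file name is an assumption):
```
import yaml

with open("rec_ppocr_v3_rotnet.yml") as f:  # assumed name for the config above
    cfg = yaml.safe_load(f)
print(cfg["Architecture"]["Head"]["class_dim"])       # 4 rotation classes
print(cfg["Train"]["loader"]["batch_size_per_card"])  # 32
```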

Some files were not shown because too many files have changed in this diff.