添加PaddleOCr项目
This commit is contained in:
parent
b6bfe484be
commit
cb8384c5bb
34
PaddleOCR-3.1.0/.gitignore
vendored
Normal file
34
PaddleOCR-3.1.0/.gitignore
vendored
Normal file
@ -0,0 +1,34 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
.ipynb_checkpoints/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
inference/
|
||||
inference_results/
|
||||
output/
|
||||
train_data/
|
||||
log/
|
||||
*.DS_Store
|
||||
*.vs
|
||||
*.user
|
||||
*~
|
||||
*.vscode
|
||||
*.idea
|
||||
|
||||
*.log
|
||||
.clang-format
|
||||
.clang_format.hook
|
||||
|
||||
build/
|
||||
dist/
|
||||
paddleocr.egg-info/
|
||||
/deploy/android_demo/app/OpenCV/
|
||||
/deploy/android_demo/app/PaddleLite/
|
||||
/deploy/android_demo/app/.cxx/
|
||||
/deploy/android_demo/app/cache/
|
||||
test_tipc/web/models/
|
||||
test_tipc/web/node_modules/
|
||||
45
PaddleOCR-3.1.0/.pre-commit-config.yaml
Normal file
45
PaddleOCR-3.1.0/.pre-commit-config.yaml
Normal file
@ -0,0 +1,45 @@
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: check-added-large-files
|
||||
args: ['--maxkb=512']
|
||||
- id: check-case-conflict
|
||||
- id: check-merge-conflict
|
||||
- id: check-symlinks
|
||||
- id: detect-private-key
|
||||
- id: end-of-file-fixer
|
||||
- id: trailing-whitespace
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
|
||||
- repo: https://github.com/Lucas-C/pre-commit-hooks
|
||||
rev: v1.5.5
|
||||
hooks:
|
||||
- id: remove-crlf
|
||||
- id: remove-tabs
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|py)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: clang-format
|
||||
name: clang-format
|
||||
description: Format files with ClangFormat
|
||||
entry: bash .clang_format.hook -i
|
||||
language: system
|
||||
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
|
||||
# For Python files
|
||||
- repo: https://github.com/psf/black.git
|
||||
rev: 24.10.0
|
||||
hooks:
|
||||
- id: black
|
||||
files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
|
||||
|
||||
# Flake8
|
||||
- repo: https://github.com/pycqa/flake8
|
||||
rev: 7.1.1
|
||||
hooks:
|
||||
- id: flake8
|
||||
args:
|
||||
- --count
|
||||
- --select=E9,F63,F7,F82,E721
|
||||
- --show-source
|
||||
- --statistics
|
||||
exclude: ^benchmark/|^test_tipc/
|
||||
3
PaddleOCR-3.1.0/.style.yapf
Normal file
3
PaddleOCR-3.1.0/.style.yapf
Normal file
@ -0,0 +1,3 @@
|
||||
[style]
|
||||
based_on_style = pep8
|
||||
column_limit = 80
|
||||
201
PaddleOCR-3.1.0/LICENSE
Normal file
201
PaddleOCR-3.1.0/LICENSE
Normal file
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
22
PaddleOCR-3.1.0/MANIFEST.in
Normal file
22
PaddleOCR-3.1.0/MANIFEST.in
Normal file
@ -0,0 +1,22 @@
|
||||
prune .github
|
||||
prune applications
|
||||
prune benchmark
|
||||
prune configs
|
||||
prune deploy
|
||||
prune doc
|
||||
prune docs
|
||||
prune overrides
|
||||
prune ppocr/ext_op
|
||||
prune ppocr/losses
|
||||
prune ppocr/metrics
|
||||
prune ppocr/modeling
|
||||
prune ppocr/optimizer
|
||||
prune ppstructure/docs
|
||||
prune test_tipc
|
||||
prune tests
|
||||
exclude .clang_format.hook
|
||||
exclude .gitignore
|
||||
exclude .pre-commit-config.yaml
|
||||
exclude .style.yapf
|
||||
exclude mkdocs.yml
|
||||
exclude train.sh
|
||||
353
PaddleOCR-3.1.0/README.md
Normal file
353
PaddleOCR-3.1.0/README.md
Normal file
@ -0,0 +1,353 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
English | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 Introduction
|
||||
Since its initial release, PaddleOCR has gained widespread acclaim across academia, industry, and research communities, thanks to its cutting-edge algorithms and proven performance in real-world applications. It's already powering popular open-source projects like Umi-OCR, OmniParser, MinerU, and RAGFlow, making it the go-to OCR toolkit for developers worldwide.
|
||||
|
||||
On May 20, 2025, the PaddlePaddle team unveiled PaddleOCR 3.0, fully compatible with the official release of the **PaddlePaddle 3.0** framework. This update further **boosts text-recognition accuracy**, adds support for **multiple text-type recognition** and **handwriting recognition**, and meets the growing demand from large-model applications for **high-precision parsing of complex documents**. When combined with the **ERNIE 4.5 Turbo**, it significantly enhances key-information extraction accuracy. PaddleOCR 3.0 also introduces support for Chinese Heterogeneous AI Accelerators such as **KUNLUNXIN** and **Ascend**. For the complete usage documentation, please refer to the [PaddleOCR 3.0 Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
|
||||
|
||||
Three Major New Features in PaddleOCR 3.0:
|
||||
- Universal-Scene Text Recognition Model [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): A single model that handles five different text types plus complex handwriting. Overall recognition accuracy has increased by 13 percentage points over the previous generation. [Online Demo](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- General Document-Parsing Solution [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Delivers high-precision parsing of multi-layout, multi-scene PDFs, outperforming many open- and closed-source solutions on public benchmarks. [Online Demo](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- Intelligent Document-Understanding Solution [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Natively powered by the ERNIE 4.5 Turbo, achieving 15 percentage points higher accuracy than its predecessor. [Online Demo](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
In addition to providing an outstanding model library, PaddleOCR 3.0 also offers user-friendly tools covering model training, inference, and service deployment, so developers can rapidly bring AI applications to production.
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
## 📣 Recent updates
|
||||
|
||||
|
||||
#### **2025.06.29: Release of PaddleOCR 3.1.0**, includes:
|
||||
|
||||
- **Key Models and Pipelines:**
|
||||
- **Added PP-OCRv5 Multilingual Text Recognition Model**, which supports the training and inference process for text recognition models in 37 languages, including French, Spanish, Portuguese, Russian, Korean, etc. **Average accuracy improved by over 30%.** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- Upgraded the **PP-Chart2Table model** in PP-StructureV3, further enhancing the capability of converting charts to tables. On internal custom evaluation sets, the metric (RMS-F1) **increased by 9.36 percentage points (71.24% -> 80.60%).**
|
||||
- Newly launched **document translation pipeline, PP-DocTranslation, based on PP-StructureV3 and ERNIE 4.5 Turbo**, which supports the translation of Markdown format documents, various complex-layout PDF documents, and document images, with the results saved as Markdown format documents. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
|
||||
- **New MCP server:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **Supports both OCR and PP-StructureV3 pipelines.**
|
||||
- Supports three working modes: local Python library, AIStudio Community Cloud Service, and self-hosted service.
|
||||
- Supports invoking local services via stdio and remote services via Streamable HTTP.
|
||||
|
||||
- **Documentation Optimization:** Improved the descriptions in some user guides for a smoother reading experience.
|
||||
|
||||
#### **2025.06.26: Release of PaddleOCR 3.0.3**, includes:
|
||||
|
||||
- Bug Fix: Resolved the issue where the `enable_mkldnn` parameter was not effective, restoring the default behavior of using MKL-DNN for CPU inference.
|
||||
|
||||
#### 🔥🔥 **2025.06.19: Release of PaddleOCR 3.0.2**, includes:
|
||||
|
||||
- **New Features:**
|
||||
|
||||
- The default download source has been changed from `BOS` to `HuggingFace`. Users can also change the environment variable `PADDLE_PDX_MODEL_SOURCE` to `BOS` to set the model download source back to Baidu Object Storage (BOS).
|
||||
- Added service invocation examples for six languages—C++, Java, Go, C#, Node.js, and PHP—for pipelines like PP-OCRv5, PP-StructureV3, and PP-ChatOCRv4.
|
||||
- Improved the layout partition sorting algorithm in the PP-StructureV3 pipeline, enhancing the sorting logic for complex vertical layouts to deliver better results.
|
||||
- Enhanced model selection logic: when a language is specified but a model version is not, the system will automatically select the latest model version supporting that language.
|
||||
- Set a default upper limit for MKL-DNN cache size to prevent unlimited growth, while also allowing users to configure cache capacity.
|
||||
- Updated default configurations for high-performance inference to support Paddle MKL-DNN acceleration and optimized the logic for automatic configuration selection for smarter choices.
|
||||
- Adjusted the logic for obtaining the default device to consider the actual support for computing devices by the installed Paddle framework, making program behavior more intuitive.
|
||||
- Added Android example for PP-OCRv5. [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
|
||||
|
||||
- **Bug Fixes:**
|
||||
|
||||
- Fixed an issue with some CLI parameters in PP-StructureV3 not taking effect.
|
||||
- Resolved an issue where `export_paddlex_config_to_yaml` would not function correctly in certain cases.
|
||||
- Corrected the discrepancy between the actual behavior of `save_path` and its documentation description.
|
||||
- Fixed potential multithreading errors when using MKL-DNN in basic service deployment.
|
||||
- Corrected channel order errors in image preprocessing for the Latex-OCR model.
|
||||
- Fixed channel order errors in saving visualized images within the text recognition module.
|
||||
- Resolved channel order errors in visualized table results within PP-StructureV3 pipeline.
|
||||
- Fixed an overflow issue in the calculation of `overlap_ratio` under extremely special circumstances in the PP-StructureV3 pipeline.
|
||||
|
||||
- **Documentation Improvements:**
|
||||
|
||||
- Updated the description of the `enable_mkldnn` parameter in the documentation to accurately reflect the program's actual behavior.
|
||||
- Fixed errors in the documentation regarding the `lang` and `ocr_version` parameters.
|
||||
- Added instructions for exporting production line configuration files via CLI.
|
||||
- Fixed missing columns in the performance data table for PP-OCRv5.
|
||||
- Refined benchmark metrics for PP-StructureV3 across different configurations.
|
||||
|
||||
- **Others:**
|
||||
|
||||
- Relaxed version restrictions on dependencies like numpy and pandas, restoring support for Python 3.12.
|
||||
|
||||
<details>
|
||||
<summary><strong>History Log</strong></summary>
|
||||
|
||||
#### **🔥🔥 2025.06.05: Release of PaddleOCR 3.0.1, includes:**
|
||||
|
||||
- **Optimisation of certain models and model configurations:**
|
||||
- Updated the default model configuration for PP-OCRv5, changing both detection and recognition from mobile to server models. To improve default performance in most scenarios, the parameter `limit_side_len` in the configuration has been changed from 736 to 64.
|
||||
- Added a new text line orientation classification model `PP-LCNet_x1_0_textline_ori` with an accuracy of 99.42%. The default text line orientation classifier for OCR, PP-StructureV3, and PP-ChatOCRv4 pipelines has been updated to this model.
|
||||
- Optimised the text line orientation classification model `PP-LCNet_x0_25_textline_ori`, improving accuracy by 3.3 percentage points to a current accuracy of 98.85%.
|
||||
|
||||
- **Optimizations and fixes for some issues in version 3.0.0, [details](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: Official Release of **PaddleOCR v3.0**, including:
|
||||
- **PP-OCRv5**: High-Accuracy Text Recognition Model for All Scenarios - Instant Text from Images/PDFs.
|
||||
1. 🌐 Single-model support for **five** text types - Seamlessly process **Simplified Chinese, Traditional Chinese, Simplified Chinese Pinyin, English** and **Japanese** within a single model.
|
||||
2. ✍️ Improved **handwriting recognition**: Significantly better at complex cursive scripts and non-standard handwriting.
|
||||
3. 🎯 **13-point accuracy gain** over PP-OCRv4, achieving state-of-the-art performance across a variety of real-world scenarios.
|
||||
|
||||
- **PP-StructureV3**: General-Purpose Document Parsing – Unleash SOTA Images/PDFs Parsing for Real-World Scenarios!
|
||||
1. 🧮 **High-Accuracy multi-scene PDF parsing**, leading both open- and closed-source solutions on the OmniDocBench benchmark.
|
||||
2. 🧠 Specialized capabilities include **seal recognition**, **chart-to-table conversion**, **table recognition with nested formulas/images**, **vertical text document parsing**, and **complex table structure analysis**.
|
||||
|
||||
- **PP-ChatOCRv4**: Intelligent Document Understanding – Extract Key Information, not just text from Images/PDFs.
|
||||
1. 🔥 **15-point accuracy gain** in key-information extraction on PDF/PNG/JPG files over the previous generation.
|
||||
2. 💻 Native support for **ERNIE 4.5 Turbo**, with compatibility for large-model deployments via PaddleNLP, Ollama, vLLM, and more.
|
||||
3. 🤝 Integrated [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), enabling extraction and understanding of printed text, handwriting, seals, tables, charts, and other common elements in complex documents.
|
||||
|
||||
[History Log](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ Quick Start
|
||||
### 1. Run online demo
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. Installation
|
||||
|
||||
Install PaddlePaddle refer to [Installation Guide](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), after then, install the PaddleOCR toolkit.
|
||||
|
||||
```bash
|
||||
# Install paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. Run inference by CLI
|
||||
```bash
|
||||
# Run PP-OCRv5 inference
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# Run PP-StructureV3 inference
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Get more information about "paddleocr ocr"
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. Run inference by API
|
||||
**4.1 PP-OCRv5 Example**
|
||||
```python
|
||||
# Initialize PaddleOCR instance
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# Run OCR inference on a sample image
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# Visualize the results and save the JSON results
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 PP-StructureV3 Example</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# For Image
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# Visualize the results and save the JSON results
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 PP-ChatOCRv4 Example</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# If a multimodal large model is used, the local mllm service needs to be started. You can refer to the documentation at https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md to perform the deployment and update the mllm_chat_bot_config configuration.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### 5. Chinese Heterogeneous AI Accelerators
|
||||
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
|
||||
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
|
||||
|
||||
## ⛰️ Advanced Tutorials
|
||||
- [PP-OCRv5 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [PP-StructureV3 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [PP-ChatOCRv4 Tutorial](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 Quick Overview of Execution Results
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩‍👩‍👧‍👦 Community
|
||||
|
||||
| PaddlePaddle WeChat official account | Join the tech discussion group |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 😃 Awesome Projects Leveraging PaddleOCR
|
||||
PaddleOCR wouldn't be where it is today without its incredible community! 💗 A massive thank you to all our longtime partners, new collaborators, and everyone who's poured their passion into PaddleOCR — whether we've named you or not. Your support fuels our fire!
|
||||
|
||||
| Project Name | Description |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG engine based on deep document understanding.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Multi-type Document to Markdown Conversion Tool|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|A powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Recognize text on the screen, translate it and show the translation results in real time.|
|
||||
| [Learn more projects](./awesome_projects.md) | [More projects based on PaddleOCR](./awesome_projects.md)|
|
||||
|
||||
## 👩‍👩‍👧‍👦 Contributors
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 License
|
||||
This project is released under the [Apache 2.0 license](LICENSE).
|
||||
|
||||
## 🎓 Citation
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
413
PaddleOCR-3.1.0/README_ar.md
Normal file
413
PaddleOCR-3.1.0/README_ar.md
Normal file
@ -0,0 +1,413 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
<p>
|
||||
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | العربية
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div dir="rtl">
|
||||
|
||||
## 🚀 مقدمة
|
||||
منذ إصداره الأولي، حظي PaddleOCR بتقدير واسع النطاق في الأوساط الأكاديمية والصناعية والبحثية، بفضل خوارزمياته المتطورة وأدائه المثبت في تطبيقات العالم الحقيقي. وهو يدعم بالفعل مشاريع مفتوحة المصدر شهيرة مثل Umi-OCR، و OmniParser، و MinerU، و RAGFlow، مما يجعله مجموعة أدوات التعرف الضوئي على الحروف المفضلة للمطورين في جميع أنحاء العالم.
|
||||
|
||||
في 20 مايو 2025، كشف فريق PaddlePaddle عن PaddleOCR 3.0، المتوافق تمامًا مع الإصدار الرسمي لإطار العمل **PaddlePaddle 3.0**. يعزز هذا التحديث **دقة التعرف على النصوص**، ويضيف دعمًا لـ **التعرف على أنواع نصوص متعددة** و **التعرف على الكتابة اليدوية**، ويلبي الطلب المتزايد من التطبيقات القائمة على النماذج الكبيرة على **التحليل عالي الدقة للمستندات المعقدة**. عند دمجه مع **ERNIE 4.5 Turbo**، فإنه يعزز بشكل كبير دقة استخراج المعلومات الرئيسية. كما يقدم PaddleOCR 3.0 دعمًا لمسرعات الذكاء الاصطناعي الصينية غير المتجانسة مثل **KUNLUNXIN** و **Ascend**. للحصول على وثائق الاستخدام الكاملة، يرجى الرجوع إلى [وثائق PaddleOCR 3.0](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
|
||||
|
||||
##### ثلاث ميزات رئيسية جديدة في PaddleOCR 3.0:
|
||||
نموذج التعرف على النصوص في جميع السيناريوهات [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): نموذج واحد يعالج خمسة أنواع مختلفة من النصوص بالإضافة إلى الكتابة اليدوية المعقدة. زادت دقة التعرف الإجمالية بمقدار 13 نقطة مئوية عن الجيل السابق. [تجربة مباشرة](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
حل تحليل المستندات العام [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): يقدم تحليلًا عالي الدقة لملفات PDF متعددة التخطيطات والسيناريوهات، متفوقًا على العديد من الحلول المفتوحة والمغلقة المصدر في المعايير العامة. [تجربة مباشرة](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
حل فهم المستندات الذكي [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): مدعوم أصلاً بنموذج **ERNIE 4.5 Turbo**، ويحقق دقة أعلى بنسبة 15 نقطة مئوية من سابقه. [تجربة مباشرة](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
بالإضافة إلى توفير مكتبة نماذج متميزة، يقدم PaddleOCR 3.0 أيضًا أدوات سهلة الاستخدام تغطي تدريب النماذج والاستدلال ونشر الخدمات، حتى يتمكن المطورون من إدخال تطبيقات الذكاء الاصطناعي إلى الإنتاج بسرعة.
|
||||
|
||||
<p align="center">
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
|
||||
</p>
|
||||
|
||||
|
||||
|
||||
|
||||
## 📣 آخر التحديثات
|
||||
|
||||
|
||||
<h4 dir="rtl">🔥🔥<strong>2025.06.29: إصدار <bdi dir="ltr">PaddleOCR 3.1.0</bdi>، يتضمن:</strong></h4>
|
||||
<ul dir="rtl">
|
||||
<li><strong>النماذج وخطوط الأنابيب الرئيسية:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>
|
||||
<strong>تمت إضافة نموذج التعرف على النصوص متعدد اللغات <bdi dir="ltr">PP-OCRv5</bdi></strong>، والذي يدعم تدريب واستدلال نماذج التعرف على النصوص في 37 لغة، بما في ذلك الفرنسية، الإسبانية، البرتغالية، الروسية، الكورية وغيرها. <strong>تحسنت الدقة المتوسطة بنسبة تزيد عن 30%.</strong>
|
||||
<a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html">التفاصيل</a>
|
||||
</li>
|
||||
<li>
|
||||
تم ترقية نموذج <bdi dir="ltr">PP-Chart2Table</bdi> في <bdi dir="ltr">PP-StructureV3</bdi>، مما عزز أكثر من إمكانية تحويل المخططات إلى جداول. في مجموعات التقييم الداخلية، ارتفع المقياس (<bdi dir="ltr">RMS-F1</bdi>) بمقدار <strong>9.36 نقطة مئوية (71.24% → 80.60%)</strong>.
|
||||
</li>
|
||||
<li>
|
||||
تم إطلاق خط أنابيب ترجمة المستندات الجديد <bdi dir="ltr">PP-DocTranslation</bdi>، المبني على <bdi dir="ltr">PP-StructureV3</bdi> و <bdi dir="ltr">ERNIE 4.5 Turbo</bdi>، ويدعم ترجمة مستندات <bdi dir="ltr">Markdown</bdi>، ومستندات <bdi dir="ltr">PDF</bdi> ذات التنسيقات المعقدة وصور المستندات، مع حفظ النتائج كمستندات <bdi dir="ltr">Markdown</bdi>.
|
||||
<a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html">التفاصيل</a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>دعم MCP:</strong><a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html">التفاصيل</a>
|
||||
<ul dir="rtl">
|
||||
<li>
|
||||
<strong>يدعم خطوط أنابيب OCR و PP-StructureV3.</strong>
|
||||
</li>
|
||||
<li>
|
||||
يدعم ثلاثة أوضاع عمل: مكتبة Python المحلية، خدمة السحابة المجتمعية AIStudio، وخدمة الاستضافة الذاتية.
|
||||
</li>
|
||||
<li>
|
||||
يدعم استدعاء الخدمات المحلية عبر stdio والخدمات البعيدة عبر Streamable HTTP.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>تحسين الوثائق:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>تم تحسين الشروحات في بعض الأدلة للمستخدمين لتوفير تجربة قراءة أكثر سلاسة.</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h4 dir="rtl">🔥🔥<strong>2025.06.26: إصدار <bdi dir="ltr">PaddleOCR 3.0.3</bdi>، يتضمن:</strong></h4>
|
||||
<ul dir="rtl">
|
||||
<li> تصحيح خلل: تم حل المشكلة التي لم يكن فيها معلمة <code>enable_mkldnn</code> فعّالة، واستعادة السلوك الافتراضي باستخدام MKL-DNN للاستدلال بوحدة المعالجة المركزية.</li>
|
||||
</ul>
|
||||
|
||||
<h4 dir="rtl">🔥🔥<strong>2025.06.19: إصدار <bdi dir="ltr">PaddleOCR 3.0.2</bdi>، يتضمن:</strong></h4>
|
||||
<ul dir="rtl">
|
||||
<li><strong>ميزات جديدة:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>تم تغيير مصدر التنزيل الافتراضي من <bdi dir="ltr"><code>BOS</code></bdi> إلى <bdi dir="ltr"><code>HuggingFace</code></bdi>. يمكن للمستخدمين أيضًا تغيير متغير البيئة <bdi dir="ltr"><code>PADDLE_PDX_MODEL_SOURCE</code></bdi> إلى <bdi dir="ltr"><code>BOS</code></bdi> لإعادة تعيين مصدر تنزيل النموذج إلى <bdi dir="ltr">Baidu Object Storage (BOS)</bdi>.</li>
|
||||
<li>تمت إضافة أمثلة استدعاء الخدمة لست لغات — <bdi dir="ltr">C++</bdi>, <bdi dir="ltr">Java</bdi>, <bdi dir="ltr">Go</bdi>, <bdi dir="ltr">C#</bdi>, <bdi dir="ltr">Node.js</bdi>, و <bdi dir="ltr">PHP</bdi> — لخطوط الأنابيب مثل <bdi dir="ltr">PP-OCRv5</bdi>, <bdi dir="ltr">PP-StructureV3</bdi>, و <bdi dir="ltr">PP-ChatOCRv4</bdi>.</li>
|
||||
<li>تحسين خوارزمية فرز تقسيم التخطيط في خط أنابيب <bdi dir="ltr">PP-StructureV3</bdi>، مما يعزز منطق الفرز للتخطيطات العمودية المعقدة لتقديم نتائج أفضل.</li>
|
||||
<li>تحسين منطق اختيار النموذج: عند تحديد لغة وعدم تحديد إصدار النموذج، سيقوم النظام تلقائيًا بتحديد أحدث إصدار للنموذج يدعم تلك اللغة.</li>
|
||||
<li>تعيين حد أعلى افتراضي لحجم ذاكرة التخزين المؤقت لـ <bdi dir="ltr">MKL-DNN</bdi> لمنع النمو غير المحدود، مع السماح للمستخدمين أيضًا بتكوين سعة ذاكرة التخزين المؤقت.</li>
|
||||
<li>تحديث التكوينات الافتراضية للاستدلال عالي الأداء لدعم تسريع <bdi dir="ltr">Paddle MKL-DNN</bdi> وتحسين منطق الاختيار التلقائي للتكوين لخيارات أكثر ذكاءً.</li>
|
||||
<li>تعديل منطق الحصول على الجهاز الافتراضي لمراعاة الدعم الفعلي لأجهزة الحوسبة بواسطة إطار عمل <bdi dir="ltr">Paddle</bdi> المثبت، مما يجعل سلوك البرنامج أكثر بديهية.</li>
|
||||
<li>إضافة مثال <bdi dir="ltr">Android</bdi> لـ <bdi dir="ltr">PP-OCRv5</bdi>. <a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html">التفاصيل</a>.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>إصلاحات الأخطاء:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>إصلاح مشكلة عدم تفعيل بعض معلمات <bdi dir="ltr">CLI</bdi> في <bdi dir="ltr">PP-StructureV3</bdi>.</li>
|
||||
<li>حل مشكلة حيث لا تعمل <bdi dir="ltr"><code>export_paddlex_config_to_yaml</code></bdi> بشكل صحيح في بعض الحالات.</li>
|
||||
<li>تصحيح التناقض بين السلوك الفعلي لـ <bdi dir="ltr"><code>save_path</code></bdi> ووصفه في الوثائق.</li>
|
||||
<li>إصلاح أخطاء تعدد الخيوط المحتملة عند استخدام <bdi dir="ltr">MKL-DNN</bdi> في نشر الخدمة الأساسية.</li>
|
||||
<li>تصحيح أخطاء ترتيب القنوات في المعالجة المسبقة للصور لنموذج <bdi dir="ltr">Latex-OCR</bdi>.</li>
|
||||
<li>إصلاح أخطاء ترتيب القنوات في حفظ الصور المرئية داخل وحدة التعرف على النص.</li>
|
||||
<li>حل أخطاء ترتيب القنوات في نتائج الجداول المرئية داخل خط أنابيب <bdi dir="ltr">PP-StructureV3</bdi>.</li>
|
||||
<li>إصلاح مشكلة تجاوز السعة في حساب <bdi dir="ltr"><code>overlap_ratio</code></bdi> في ظروف خاصة للغاية في خط أنابيب <bdi dir="ltr">PP-StructureV3</bdi>.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>تحسينات على الوثائق:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>تحديث وصف المعلمة <bdi dir="ltr"><code>enable_mkldnn</code></bdi> في الوثائق لتعكس بدقة السلوك الفعلي للبرنامج.</li>
|
||||
<li>إصلاح الأخطاء في الوثائق المتعلقة بمعلمات <bdi dir="ltr"><code>lang</code></bdi> و <bdi dir="ltr"><code>ocr_version</code></bdi>.</li>
|
||||
<li>إضافة تعليمات لتصدير ملفات تكوين خط الإنتاج عبر <bdi dir="ltr">CLI</bdi>.</li>
|
||||
<li>إصلاح الأعمدة المفقودة في جدول بيانات أداء <bdi dir="ltr">PP-OCRv5</bdi>.</li>
|
||||
<li>تحسين مقاييس الأداء لـ <bdi dir="ltr">PP-StructureV3</bdi> عبر تكوينات مختلفة.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>أخرى:</strong>
|
||||
<ul dir="rtl">
|
||||
<li>تخفيف قيود الإصدار على التبعيات مثل <bdi dir="ltr">numpy</bdi> و <bdi dir="ltr">pandas</bdi>، واستعادة الدعم لـ <bdi dir="ltr">Python 3.12</bdi>.</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<details>
|
||||
<summary dir="rtl"><strong>سجل التحديثات</strong></summary>
|
||||
|
||||
<h4 dir="rtl"><strong>🔥🔥 2025.06.05: إصدار <bdi dir="ltr">PaddleOCR 3.0.1</bdi>، يتضمن:</strong></h4>
|
||||
<ul dir="rtl">
|
||||
<li><strong>تحسين بعض النماذج وتكويناتها:</strong>
|
||||
<ol dir="rtl">
|
||||
<li>تحديث تكوين النموذج الافتراضي لـ <bdi dir="ltr">PP-OCRv5</bdi>، وتغيير كل من الكشف والتعرف من <bdi dir="ltr"><code>mobile</code></bdi> إلى <bdi dir="ltr"><code>server</code></bdi>. لتحسين الأداء الافتراضي في معظم السيناريوهات، تم تغيير المعلمة <bdi dir="ltr"><code>limit_side_len</code></bdi> في التكوين من 736 إلى 64.</li>
|
||||
<li>إضافة نموذج جديد لتصنيف اتجاه أسطر النص <bdi dir="ltr"><code>PP-LCNet_x1_0_textline_ori</code></bdi> بدقة 99.42%. تم تحديث مصنف اتجاه أسطر النص الافتراضي لخطوط أنابيب <bdi dir="ltr">OCR</bdi> و <bdi dir="ltr">PP-StructureV3</bdi> و <bdi dir="ltr">PP-ChatOCRv4</bdi> إلى هذا النموذج.</li>
|
||||
<li>تحسين نموذج تصنيف اتجاه أسطر النص <bdi dir="ltr"><code>PP-LCNet_x0_25_textline_ori</code></bdi>، مما أدى إلى تحسين الدقة بمقدار 3.3 نقطة مئوية لتصل إلى الدقة الحالية البالغة 98.85%.</li>
|
||||
</ol>
|
||||
<li><strong>تحسينات وإصلاحات لبعض المشكلات في الإصدار 3.0.0، <a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html">التفاصيل</a></strong></li>
|
||||
</ul>
|
||||
|
||||
🔥🔥2025.05.20: الإصدار الرسمي لـ **PaddleOCR v3.0**، بما في ذلك:
|
||||
<h4 dir="rtl"><bdi dir="ltr">PP-OCRv5</bdi>: نموذج التعرف على النصوص عالي الدقة لجميع السيناريوهات – نص فوري من الصور/PDF.</h4>
|
||||
<ol dir="rtl">
|
||||
<li>🌐 دعم نموذج واحد **لخمسة** أنواع من النصوص - معالجة سلسة **للصينية المبسطة والصينية التقليدية وبينين الصينية المبسطة والإنجليزية** و**اليابانية** ضمن نموذج واحد.</li>
|
||||
<li>✍️ تحسين **التعرف على الكتابة اليدوية**: أداء أفضل بشكل ملحوظ في النصوص المتصلة المعقدة والكتابة اليدوية غير القياسية.</li>
|
||||
<li>🎯 **زيادة في الدقة بمقدار 13 نقطة** عن <bdi dir="ltr">PP-OCRv4</bdi>، مما يحقق أداءً على أحدث طراز في مجموعة متنوعة من سيناريوهات العالم الحقيقي.</li>
|
||||
</ol>
|
||||
|
||||
<h4 dir="rtl"><bdi dir="ltr">PP-StructureV3</bdi>: تحليل المستندات للأغراض العامة – أطلق العنان لتحليل الصور/PDFs بأحدث التقنيات لسيناريوهات العالم الحقيقي!</h4>
|
||||
<ol dir="rtl">
|
||||
<li>🧮 **تحليل PDF عالي الدقة متعدد السيناريوهات**، يتصدر كلاً من الحلول المفتوحة والمغلقة المصدر على معيار <bdi dir="ltr">OmniDocBench</bdi>.</li>
|
||||
<li>🧠 تشمل القدرات المتخصصة **التعرف على الأختام**، **تحويل المخططات إلى جداول**، **التعرف على الجداول التي تحتوي على صيغ/صور متداخلة**، **تحليل المستندات ذات النصوص العمودية**، و**تحليل هياكل الجداول المعقدة**.</li>
|
||||
</ol>
|
||||
|
||||
<h4 dir="rtl"><bdi dir="ltr">PP-ChatOCRv4</bdi>: فهم المستندات الذكي – استخرج المعلومات الأساسية، وليس فقط النصوص من الصور/PDFs.</h4>
|
||||
<ol dir="rtl">
|
||||
<li>🔥 **زيادة في الدقة بمقدار 15 نقطة** في استخراج المعلومات الأساسية من ملفات <bdi dir="ltr">PDF/PNG/JPG</bdi> مقارنة بالجيل السابق.</li>
|
||||
<li>💻 دعم أصلي لـ <bdi dir="ltr">ERNIE 4.5 Turbo</bdi>، مع التوافق مع عمليات نشر النماذج الكبيرة عبر <bdi dir="ltr">PaddleNLP</bdi> و <bdi dir="ltr">Ollama</bdi> و <bdi dir="ltr">vLLM</bdi> والمزيد.</li>
|
||||
<li>🤝 دمج <a href="https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2" dir="ltr">PP-DocBee2</a>، مما يتيح استخراج وفهم النصوص المطبوعة والمخطوطة والأختام والجداول والمخططات والعناصر الشائعة الأخرى في المستندات المعقدة.</li>
|
||||
</ol>
|
||||
|
||||
<p align="right">[<a href="https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html">سجل التحديثات</a>]</p>
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ التشغيل السريع
|
||||
### 1. تشغيل العرض التوضيحي عبر الإنترنت
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
|
||||
|
||||
### 2. التثبيت
|
||||
|
||||
قم بتثبيت PaddlePaddle بالرجوع إلى [دليل التثبيت](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)، وبعد ذلك، قم بتثبيت مجموعة أدوات PaddleOCR.
|
||||
|
||||
|
||||
<div style="text-align: left;">
|
||||
|
||||
```bash
|
||||
# Install paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
### 3. تشغيل الاستدلال عبر واجهة سطر الأوامر (CLI)
|
||||
|
||||
<div style="text-align: left !important;">
|
||||
|
||||
```bash
|
||||
# Run PP-OCRv5 inference
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# Run PP-StructureV3 inference
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Get the Qianfan API Key at first, and then run PP-ChatOCRv4 inference
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Get more information about "paddleocr ocr"
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
</div>
|
||||
|
||||
### 4. تشغيل الاستدلال عبر واجهة برمجة التطبيقات (API)
|
||||
|
||||
<details dir="ltr" open>
|
||||
<summary dir="rtl"><strong>4.1 مثال PP-OCRv5</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary dir="rtl"><strong>4.2 مثال PP-StructureV3</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# للصور
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# عرض النتائج وحفظها بصيغة JSON
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary dir="rtl"><strong>4.3 مثال PP-ChatOCRv4</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# إذا تم استخدام نموذج كبير متعدد الوسائط، فيجب بدء خدمة mllm المحلية. يمكنك الرجوع إلى الوثائق: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md لتنفيذ النشر وتحديث تكوين mllm_chat_bot_config.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
</details>
|
||||
|
||||
|
||||
### 5. مسرّعات الذكاء الاصطناعي الصينية غير المتجانسة
|
||||
<ul dir="rtl">
|
||||
<li><a href="https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html">Huawei Ascend</a></li>
|
||||
<li><a href="https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html">KUNLUNXIN</a></li>
|
||||
</ul>
|
||||
|
||||
## ⛰️ دروس متقدمة
|
||||
- [درس PP-OCRv5 التعليمي](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [درس PP-StructureV3 التعليمي](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [درس PP-ChatOCRv4 التعليمي](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 نظرة سريعة على نتائج التنفيذ
|
||||
|
||||
|
||||
<p align="center">
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
|
||||
</p>
|
||||
|
||||
|
||||
|
||||
<p align="center">
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
|
||||
</p>
|
||||
|
||||
|
||||
## 👩‍👩‍👧‍👦 المجتمع
|
||||
|
||||
| حساب PaddlePaddle الرسمي على WeChat | انضم إلى مجموعة النقاش التقني |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 😃 مشاريع رائعة تستخدم PaddleOCR
|
||||
لم يكن PaddleOCR ليصل إلى ما هو عليه اليوم بدون مجتمعه المذهل! 💗 شكرًا جزيلاً لجميع شركائنا القدامى، والمتعاونين الجدد، وكل من صب شغفه في PaddleOCR - سواء ذكرنا اسمك أم لا. دعمكم يشعل نارنا!
|
||||
|
||||
| اسم المشروع | الوصف |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|محرك RAG يعتمد على فهم عميق للوثائق.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|أداة تحويل المستندات متعددة الأنواع إلى Markdown|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|برنامج OCR مجاني ومفتوح المصدر للعمل دفعة واحدة دون اتصال بالإنترنت.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |أداة OmniParser: أداة تحليل الشاشة لوكيل واجهة المستخدم الرسومية المستند إلى الرؤية البحتة.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |نظام سؤال وجواب يعتمد على أي شيء.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|مجموعة أدوات قوية مفتوحة المصدر مصممة لاستخراج محتوى عالي الجودة بكفاءة من مستندات PDF المعقدة والمتنوعة.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |يتعرف على النص على الشاشة، ويترجمه ويعرض نتائج الترجمة في الوقت الفعلي.|
|
||||
| [تعرف على المزيد من المشاريع](./awesome_projects.md) | [مشاريع أخرى تعتمد على PaddleOCR](./awesome_projects.md)|
|
||||
|
||||
## 👩‍👩‍👧‍👦 المساهمون
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 نجمة
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 الترخيص
|
||||
هذا المشروع مرخص بموجب [ترخيص Apache 2.0](LICENSE).
|
||||
|
||||
</div>
|
||||
|
||||
## 🎓 الاستشهاد الأكاديمي
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
344
PaddleOCR-3.1.0/README_cn.md
Normal file
344
PaddleOCR-3.1.0/README_cn.md
Normal file
@ -0,0 +1,344 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner_cn.png" alt="PaddleOCR Banner">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | 简体中文 | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 简介
|
||||
PaddleOCR自发布以来凭借学术前沿算法和产业落地实践,受到了产学研各方的喜爱,并被广泛应用于众多知名开源项目,例如:Umi-OCR、OmniParser、MinerU、RAGFlow等,已成为广大开发者心中的开源OCR领域的首选工具。2025年5月20日,飞桨团队发布**PaddleOCR 3.0**,全面适配**飞桨框架3.0正式版**,进一步**提升文字识别精度**,支持**多文字类型识别**和**手写体识别**,满足大模型应用对**复杂文档高精度解析**的旺盛需求,结合**ERNIE 4.5 Turbo**显著提升关键信息抽取精度,并新增**对昆仑芯、昇腾等国产硬件**的支持。完整使用文档请参考 [PaddleOCR 3.0 文档](https://paddlepaddle.github.io/PaddleOCR/latest/)。
|
||||
|
||||
PaddleOCR 3.0**新增**三大特色能力:
|
||||
- 全场景文字识别模型[PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):单模型支持五种文字类型和复杂手写体识别;整体识别精度相比上一代**提升13个百分点**。[在线体验](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
- 通用文档解析方案[PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支持多场景、多版式 PDF 高精度解析,在公开评测集中**领先众多开源和闭源方案**。[在线体验](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
- 智能文档理解方案[PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支持ERNIE 4.5 Turbo,精度相比上一代**提升15个百分点**。[在线体验](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
PaddleOCR 3.0除了提供优秀的模型库外,还提供好学易用的工具,覆盖模型训练、推理和服务化部署,方便开发者快速落地AI应用。
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch_cn.png" alt="PaddleOCR Architecture">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
## 📣 最新动态
|
||||
|
||||
|
||||
🔥🔥2025.06.29: **PaddleOCR 3.1.0** 发布,新增能力如下:
|
||||
|
||||
- **重要模型和产线:**
|
||||
- **新增 PP-OCRv5 多语种文本识别模型**,支持法语、西班牙语、葡萄牙语、俄语、韩语等 37 种语言的文字识别模型的训推流程。**平均精度涨幅超30%。**[详情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- 升级 PP-StructureV3 中的 **PP-Chart2Table 模型**,图表转表能力进一步升级,在内部自建测评集合上指标(RMS-F1)**提升 9.36 个百分点(71.24% -> 80.60%)。**
|
||||
- 新增基于 PP-StructureV3 和 ERNIE 4.5 Turbo 的**文档翻译产线 PP-DocTranslation,支持翻译 Markdown 格式文档、各种复杂版式的 PDF 文档和文档图像,结果保存为 Markdown 格式文档。**[详情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **新增MCP server:**[详情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/mcp_server.html)
|
||||
- **支持 OCR 和 PP-StructureV3 两种工具;**
|
||||
- 支持本地Python库、星河社区云服务、自托管服务三种工作模式;
|
||||
- 支持通过 stdio 调用本地服务,通过 Streamable HTTP 调用远程服务。
|
||||
|
||||
- **文档优化:** 优化了部分使用文档描述,提升阅读体验。
|
||||
|
||||
2025.06.26: **PaddleOCR 3.0.3** 发布,包含:
|
||||
- Bug修复:修复`enable_mkldnn`参数不生效的问题,恢复CPU默认使用MKL-DNN推理的行为。
|
||||
|
||||
2025.06.19: **PaddleOCR 3.0.2** 发布,包含:
|
||||
|
||||
- **功能新增:**
|
||||
- 模型默认下载源从`BOS`改为`HuggingFace`,同时也支持用户通过更改环境变量`PADDLE_PDX_MODEL_SOURCE`为`BOS`,将模型下载源设置为百度云对象存储BOS。
|
||||
- PP-OCRv5、PP-StructureV3、PP-ChatOCRv4等pipeline新增C++、Java、Go、C#、Node.js、PHP 6种语言的服务调用示例。
|
||||
- 优化PP-StructureV3产线中版面分区排序算法,对复杂竖版版面排序逻辑进行完善,进一步提升了复杂版面排序效果。
|
||||
- 优化模型选择逻辑,当指定语言、未指定模型版本时,自动选择支持该语言的最新版本的模型。
|
||||
- 为MKL-DNN缓存大小设置默认上界,防止缓存无限增长。同时,支持用户配置缓存容量。
|
||||
- 更新高性能推理默认配置,支持Paddle MKL-DNN加速。优化高性能推理自动配置逻辑,支持更智能的配置选择。
|
||||
- 调整默认设备获取逻辑,考虑环境中安装的Paddle框架对计算设备的实际支持情况,使程序行为更符合直觉。
|
||||
- 新增PP-OCRv5的Android端示例,[详情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/on_device_deployment.html)。
|
||||
|
||||
- **Bug修复:**
|
||||
- 修复PP-StructureV3部分CLI参数不生效的问题。
|
||||
- 修复部分情况下`export_paddlex_config_to_yaml`无法正常工作的问题。
|
||||
- 修复save_path实际行为与文档描述不符的问题。
|
||||
- 修复基础服务化部署在使用MKL-DNN时可能出现的多线程错误。
|
||||
- 修复Latex-OCR模型的图像预处理的通道顺序错误。
|
||||
- 修复文本识别模块保存可视化图像的通道顺序错误。
|
||||
- 修复PP-StructureV3中表格可视化结果通道顺序错误。
|
||||
- 修复PP-StructureV3产线中极特殊的情况下,计算overlap_ratio时,变量溢出问题。
|
||||
|
||||
- **文档优化:**
|
||||
- 更新文档中对`enable_mkldnn`参数的说明,使其更准确地描述程序的实际行为。
|
||||
- 修复文档中对`lang`和`ocr_version`参数描述的错误。
|
||||
- 补充通过CLI导出产线配置文件的说明。
|
||||
- 修复PP-OCRv5性能数据表格中的列缺失问题。
|
||||
- 润色PP-StructureV3在不同配置下的benchmark指标。
|
||||
|
||||
- **其他:**
|
||||
- 放松numpy、pandas等依赖的版本限制,恢复对Python 3.12的支持。
|
||||
|
||||
<details>
|
||||
<summary><strong>历史日志</strong></summary>
|
||||
|
||||
2025.06.05: **PaddleOCR 3.0.1** 发布,包含:
|
||||
|
||||
- **优化部分模型和模型配置:**
|
||||
- 更新 PP-OCRv5默认模型配置,检测和识别均由mobile改为server模型。为了改善大多数的场景默认效果,配置中的参数`limit_side_len`由736改为64
|
||||
- 新增文本行方向分类`PP-LCNet_x1_0_textline_ori`模型,精度99.42%,OCR、PP-StructureV3、PP-ChatOCRv4产线的默认文本行方向分类器改为该模型
|
||||
- 优化文本行方向分类`PP-LCNet_x0_25_textline_ori`模型,精度提升3.3个百分点,当前精度98.85%
|
||||
- **优化和修复3.0.0版本部分存在的问题,[详情](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: **PaddleOCR 3.0** 正式发布,包含:
|
||||
- **PP-OCRv5**: 全场景高精度文字识别
|
||||
|
||||
1. 🌐 单模型支持**五种**文字类型(**简体中文**、**繁体中文**、**中文拼音**、**英文**和**日文**)。
|
||||
2. ✍️ 支持复杂**手写体**识别:复杂连笔、非规范字迹识别性能显著提升。
|
||||
3. 🎯 整体识别精度提升 - 多种应用场景达到 SOTA 精度, 相比上一版本PP-OCRv4,识别精度**提升13个百分点**!
|
||||
|
||||
- **PP-StructureV3**: 通用文档解析方案
|
||||
|
||||
1. 🧮 支持多场景 PDF 高精度解析,在 OmniDocBench 基准测试中**领先众多开源和闭源方案**。
|
||||
2. 🧠 多项专精能力: **印章识别**、**图表转表格**、**嵌套公式/图片的表格识别**、**竖排文本解析**及**复杂表格结构分析**等。
|
||||
|
||||
|
||||
- **PP-ChatOCRv4**: 智能文档理解方案
|
||||
1. 🔥 文档图像(PDF/PNG/JPG)关键信息提取精度相比上一代**提升15个百分点**!
|
||||
2. 💻 原生支持**ERNIE 4.5 Turbo**,还兼容 PaddleNLP、Ollama、vLLM 等工具部署的大模型。
|
||||
3. 🤝 集成 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2),支持印刷文字、手写体文字、印章信息、表格、图表等常见的复杂文档信息抽取和理解的能力。
|
||||
|
||||
[更多日志](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ 快速开始
|
||||
### 1. 在线体验
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. 本地安装
|
||||
|
||||
请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)完成**PaddlePaddle 3.0**的安装,然后安装paddleocr。
|
||||
|
||||
```bash
|
||||
# 安装 paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. 命令行方式推理
|
||||
```bash
|
||||
# 运行 PP-OCRv5 推理
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# 运行 PP-StructureV3 推理
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 运行 PP-ChatOCRv4 推理前,需要先获得千帆API Key
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 查看 "paddleocr ocr" 详细参数
|
||||
paddleocr ocr --help
|
||||
```
|
||||
### 4. API方式推理
|
||||
|
||||
**4.1 PP-OCRv5 示例**
|
||||
```python
|
||||
from paddleocr import PaddleOCR
|
||||
# 初始化 PaddleOCR 实例
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# 对示例图像执行 OCR 推理
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# 可视化结果并保存 json 结果
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 PP-StructureV3 示例</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# For Image
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# 可视化结果并保存 json 结果
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 PP-ChatOCRv4 示例</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# 如果使用多模态大模型,需要启动本地 mllm 服务,可以参考文档:https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md 进行部署,并更新 mllm_chat_bot_config 配置。
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### 5. **国产化硬件使用**
|
||||
- [昆仑芯安装指南](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
|
||||
- [昇腾安装指南](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
|
||||
|
||||
## ⛰️ 进阶指南
|
||||
- [PP-OCRv5 使用教程](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [PP-StructureV3 使用教程](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [PP-ChatOCRv4 使用教程](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 效果展示
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 开发者社区
|
||||
|
||||
| 扫码关注飞桨公众号 | 扫码加入技术交流群 |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
## 🏆 使用 PaddleOCR 的优秀项目
|
||||
PaddleOCR 的发展离不开社区贡献!💗衷心感谢所有开发者、合作伙伴与贡献者!
|
||||
| 项目名称 | 简介 |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|基于RAG的AI工作流引擎|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|多类型文档转换Markdown工具|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|开源批量离线OCR软件|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |基于纯视觉的GUI智能体屏幕解析工具|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |基于任意内容的问答系统|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|高效复杂PDF文档提取工具包|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |屏幕实时翻译工具|
|
||||
| [更多项目](./awesome_projects.md) | |
|
||||
|
||||
## 👩👩👧👦 贡献者
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 许可协议
|
||||
本项目的发布受[Apache 2.0 license](LICENSE)许可认证。
|
||||
|
||||
## 🎓 学术引用
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
343
PaddleOCR-3.1.0/README_es.md
Normal file
343
PaddleOCR-3.1.0/README_es.md
Normal file
@ -0,0 +1,343 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="Banner de PaddleOCR">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | Español | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 Introducción
|
||||
Desde su lanzamiento inicial, PaddleOCR ha sido ampliamente aclamado en las comunidades académica, industrial y de investigación, gracias a sus algoritmos de vanguardia y su rendimiento probado en aplicaciones del mundo real. Ya está impulsando proyectos populares de código abierto como Umi-OCR, OmniParser, MinerU y RAGFlow, convirtiéndose en el conjunto de herramientas de OCR de referencia para desarrolladores de todo el mundo.
|
||||
|
||||
El 20 de mayo de 2025, el equipo de PaddlePaddle presentó PaddleOCR 3.0, totalmente compatible con la versión oficial del framework **PaddlePaddle 3.0**. Esta actualización **aumenta aún más la precisión en el reconocimiento de texto**, añade soporte para el **reconocimiento de múltiples tipos de texto** y el **reconocimiento de escritura a mano**, y satisface la creciente demanda de las aplicaciones de grandes modelos para el **análisis (parsing) de alta precisión de documentos complejos**. En combinación con **ERNIE 4.5 Turbo**, mejora significativamente la precisión en la extracción de información clave. Para la documentación de uso completa, consulte la [Documentación de PaddleOCR 3.0](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
|
||||
|
||||
Tres nuevas características principales en PaddleOCR 3.0:
|
||||
- Modelo de Reconocimiento de Texto en Escenarios Universales [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): Un único modelo que maneja cinco tipos de texto diferentes además de escritura a mano compleja. La precisión general de reconocimiento ha aumentado en 13 puntos porcentuales con respecto a la generación anterior. [Demo en línea](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- Solución de Análisis General de Documentos [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Ofrece un análisis de alta precisión de PDF con múltiples diseños y escenas, superando a muchas soluciones de código abierto y cerrado en benchmarks públicos. [Demo en línea](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- Solución de Comprensión Inteligente de Documentos [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Impulsado nativamente por el gran modelo ERNIE 4.5 Turbo, logrando una precisión 15 puntos porcentuales mayor que su predecesor. [Demo en línea](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
Además de proporcionar una excelente biblioteca de modelos, PaddleOCR 3.0 también ofrece herramientas fáciles de usar que cubren el entrenamiento de modelos, la inferencia y el despliegue de servicios, para que los desarrolladores puedan llevar rápidamente las aplicaciones de IA a producción.
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="Arquitectura de PaddleOCR">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 📣 Últimas actualizaciones
|
||||
|
||||
#### **2025.06.29: Lanzamiento de PaddleOCR 3.1.0**, incluye:
|
||||
|
||||
- **Modelos y flujos de trabajo clave:**
|
||||
- **Añadido el modelo de reconocimiento de texto multilingüe PP-OCRv5**, que soporta entrenamiento e inferencia para modelos de reconocimiento de texto en 37 idiomas, incluidos francés, español, portugués, ruso, coreano, etc. **Precisión media mejorada en más de un 30%.** [Detalles](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- Actualizado el **modelo PP-Chart2Table** en PP-StructureV3, mejorando aún más la conversión de gráficos a tablas. En conjuntos de evaluación personalizados internos, la métrica (RMS-F1) **aumentó 9,36 puntos porcentuales (71,24% -> 80,60%).**
|
||||
- Nuevo **flujo de traducción de documentos, PP-DocTranslation, basado en PP-StructureV3 y ERNIE 4.5 Turbo**, que soporta la traducción de documentos en formato Markdown, diversos PDF de diseño complejo e imágenes de documentos, guardando los resultados en formato Markdown. [Detalles](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **Nuevo servidor MCP:** [Detalles](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **Admite tanto OCR como los flujos de trabajo de PP-StructureV3.**
|
||||
- Soporta tres modos de trabajo: biblioteca local de Python, servicio en la nube de la comunidad AIStudio y servicio autohospedado.
|
||||
- Permite invocar servicios locales a través de stdio y servicios remotos a través de Streamable HTTP.
|
||||
|
||||
- **Optimización de la documentación:** Se han mejorado las descripciones en algunas guías de usuario para una experiencia de lectura más fluida.
|
||||
|
||||
#### 🔥🔥**2025.06.26: Lanzamiento de PaddleOCR 3.0.3, incluye:**
|
||||
|
||||
- Corrección de error: Se resolvió el problema donde el parámetro `enable_mkldnn` no era efectivo, restaurando el comportamiento predeterminado de usar MKL-DNN para la inferencia en CPU.
|
||||
|
||||
#### 🔥🔥**2025.06.19: Lanzamiento de PaddleOCR 3.0.2, incluye:**
|
||||
|
||||
- **Nuevas características:**
|
||||
|
||||
- La fuente de descarga predeterminada se ha cambiado de `BOS` a `HuggingFace`. Los usuarios también pueden cambiar la variable de entorno `PADDLE_PDX_MODEL_SOURCE` a `BOS` para volver a establecer la fuente de descarga del modelo en Baidu Object Storage (BOS).
|
||||
- Se agregaron ejemplos de invocación de servicios para seis idiomas (C++, Java, Go, C#, Node.js y PHP) para pipelines como PP-OCRv5, PP-StructureV3 y PP-ChatOCRv4.
|
||||
- Se mejoró el algoritmo de ordenación de particiones de diseño en el pipeline PP-StructureV3, mejorando la lógica de ordenación para diseños verticales complejos para ofrecer mejores resultados.
|
||||
- Lógica de selección de modelo mejorada: cuando se especifica un idioma pero no una versión del modelo, el sistema seleccionará automáticamente la última versión del modelo que admita ese idioma.
|
||||
- Se estableció un límite superior predeterminado para el tamaño de la caché de MKL-DNN para evitar un crecimiento ilimitado, al tiempo que se permite a los usuarios configurar la capacidad de la caché.
|
||||
- Se actualizaron las configuraciones predeterminadas para la inferencia de alto rendimiento para admitir la aceleración de Paddle MKL-DNN y se optimizó la lógica para la selección automática de configuración para elecciones más inteligentes.
|
||||
- Se ajustó la lógica para obtener el dispositivo predeterminado para considerar el soporte real de los dispositivos de computación por parte del framework Paddle instalado, lo que hace que el comportamiento del programa sea más intuitivo.
|
||||
- Añadido ejemplo de Android para PP-OCRv5. [Detalles](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
|
||||
|
||||
- **Corrección de errores:**
|
||||
|
||||
- Se solucionó un problema con algunos parámetros de CLI en PP-StructureV3 que no tenían efecto.
|
||||
- Se resolvió un problema por el cual `export_paddlex_config_to_yaml` no funcionaba correctamente en ciertos casos.
|
||||
- Se corrigió la discrepancia entre el comportamiento real de `save_path` y la descripción de su documentación.
|
||||
- Se corrigieron posibles errores de subprocesos múltiples al usar MKL-DNN en la implementación de servicios básicos.
|
||||
- Se corrigieron errores en el orden de los canales en el preprocesamiento de imágenes para el modelo Latex-OCR.
|
||||
- Se corrigieron errores en el orden de los canales al guardar imágenes visualizadas dentro del módulo de reconocimiento de texto.
|
||||
- Se resolvieron errores de orden de canales en los resultados de tablas visualizadas dentro del pipeline de PP-StructureV3.
|
||||
- Se solucionó un problema de desbordamiento en el cálculo de `overlap_ratio` en circunstancias extremadamente especiales en el pipeline PP-StructureV3.
|
||||
|
||||
- **Mejoras en la documentación:**
|
||||
|
||||
- Se actualizó la descripción del parámetro `enable_mkldnn` en la documentación para reflejar con precisión el comportamiento real del programa.
|
||||
- Se corrigieron errores en la documentación con respecto a los parámetros `lang` y `ocr_version`.
|
||||
- Se agregaron instrucciones para exportar archivos de configuración de la línea de producción a través de CLI.
|
||||
- Se corrigieron las columnas que faltaban en la tabla de datos de rendimiento para PP-OCRv5.
|
||||
- Se refinaron las métricas de referencia para PP-StructureV3 en diferentes configuraciones.
|
||||
|
||||
- **Otros:**
|
||||
|
||||
- Se flexibilizaron las restricciones de versión en dependencias como numpy y pandas, restaurando el soporte para Python 3.12.
|
||||
|
||||
<details>
|
||||
<summary><strong>Historial de actualizaciones</strong></summary>
|
||||
|
||||
#### **🔥🔥 2025.06.05: Lanzamiento de PaddleOCR 3.0.1, incluye:**
|
||||
|
||||
- **Optimización de ciertos modelos y configuraciones de modelos:**
|
||||
- Actualizada la configuración de modelo por defecto para PP-OCRv5, cambiando tanto la detección como el reconocimiento de modelos `mobile` a `server`. Para mejorar el rendimiento por defecto en la mayoría de los escenarios, el parámetro `limit_side_len` en la configuración ha sido cambiado de 736 a 64.
|
||||
- Añadido un nuevo modelo de clasificación de orientación de línea de texto `PP-LCNet_x1_0_textline_ori` con una precisión del 99.42%. El clasificador de orientación de línea de texto por defecto para los pipelines de OCR, PP-StructureV3 y PP-ChatOCRv4 ha sido actualizado a este modelo.
|
||||
- Optimizado el modelo de clasificación de orientación de línea de texto `PP-LCNet_x0_25_textline_ori`, mejorando la precisión en 3.3 puntos porcentuales hasta una precisión actual del 98.85%.
|
||||
|
||||
- **Optimizaciones y correcciones de algunos problemas en la versión 3.0.0, [detalles](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: Lanzamiento oficial de **PaddleOCR v3.0**, incluyendo:
|
||||
- **PP-OCRv5**: Modelo de Reconocimiento de Texto de Alta Precisión para Todos los Escenarios - Texto Instantáneo desde Imágenes/PDFs.
|
||||
1. 🌐 Soporte en un único modelo para **cinco** tipos de texto - Procese sin problemas **Chino Simplificado, Chino Tradicional, Pinyin de Chino Simplificado, Inglés** y **Japonés** dentro de un solo modelo.
|
||||
2. ✍️ **Reconocimiento de escritura a mano** mejorado: Significativamente mejor en escritura cursiva compleja y caligrafía no estándar.
|
||||
3. 🎯 **Ganancia de precisión de 13 puntos** sobre PP-OCRv4, alcanzando un rendimiento de vanguardia (state-of-the-art) en una variedad de escenarios del mundo real.
|
||||
|
||||
- **PP-StructureV3**: Solución de Análisis de Documentos de Propósito General – ¡Libere el poder del análisis SOTA de Imágenes/PDFs para escenarios del mundo real!
|
||||
1. 🧮 **Análisis de PDF multiescena de alta precisión**, liderando tanto a las soluciones de código abierto como a las de código cerrado en el benchmark OmniDocBench.
|
||||
2. 🧠 Capacidades especializadas que incluyen **reconocimiento de sellos**, **conversión de gráficos a tablas**, **reconocimiento de tablas con fórmulas/imágenes anidadas**, **análisis de documentos de texto vertical** y **análisis de estructuras de tablas complejas**.
|
||||
|
||||
- **PP-ChatOCRv4**: Solución Inteligente de Comprensión de Documentos – Extraiga Información Clave, no solo texto de Imágenes/PDFs.
|
||||
1. 🔥 **Ganancia de precisión de 15 puntos** en la extracción de información clave en archivos PDF/PNG/JPG con respecto a la generación anterior.
|
||||
2. 💻 Soporte nativo para **ERNIE 4.5 Turbo**, con compatibilidad para despliegues de modelos grandes a través de PaddleNLP, Ollama, vLLM y más.
|
||||
3. 🤝 Integrado con [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), permitiendo la extracción y comprensión de texto impreso, escritura a mano, sellos, tablas, gráficos y otros elementos comunes en documentos complejos.
|
||||
|
||||
[Historial de actualizaciones](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ Inicio rápido
|
||||
### 1. Ejecutar demo en línea
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. Instalación
|
||||
|
||||
Instale PaddlePaddle consultando la [Guía de Instalación](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), y después, instale el toolkit de PaddleOCR.
|
||||
|
||||
```bash
|
||||
# Instalar paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. Ejecutar inferencia por CLI
|
||||
```bash
|
||||
# Ejecutar inferencia de PP-OCRv5
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# Ejecutar inferencia de PP-StructureV3
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Obtenga primero la API Key de Qianfan y luego ejecute la inferencia de PP-ChatOCRv4
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Obtener más información sobre "paddleocr ocr"
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. Ejecutar inferencia por API
|
||||
**4.1 Ejemplo de PP-OCRv5**
|
||||
```python
|
||||
from paddleocr import PaddleOCR
|
||||
# Inicializar la instancia de PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# Ejecutar inferencia de OCR en una imagen de ejemplo
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# Visualizar los resultados y guardar los resultados en JSON
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 Ejemplo de PP-StructureV3</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# Para Imagen
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# Visualizar los resultados y guardar los resultados en JSON
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 Ejemplo de PP-ChatOCRv4</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # su api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # su api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# Si se utiliza un modelo grande multimodal, es necesario iniciar el servicio mllm local. Puede consultar la documentación: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md para realizar el despliegue y actualizar la configuración de mllm_chat_bot_config.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # la URL de su servicio mllm local
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # su api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ⛰️ Tutoriales avanzados
|
||||
- [Tutorial de PP-OCRv5](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [Tutorial de PP-StructureV3](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [Tutorial de PP-ChatOCRv4](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 Vista rápida de los resultados de ejecución
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="Demo de PP-OCRv5">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="Demo de PP-StructureV3">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 Comunidad
|
||||
|
||||
| Cuenta oficial de PaddlePaddle en WeChat | Únase al grupo de discusión técnica |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 😃 Proyectos increíbles que aprovechan PaddleOCR
|
||||
¡PaddleOCR no estaría donde está hoy sin su increíble comunidad! 💗 Un enorme agradecimiento a todos nuestros socios de siempre, nuevos colaboradores y a todos los que han volcado su pasión en PaddleOCR, ya sea que los hayamos nombrado o no. ¡Su apoyo alimenta nuestro fuego!
|
||||
|
||||
| Nombre del Proyecto | Descripción |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|Motor de RAG basado en la comprensión profunda de documentos.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Herramienta de conversión de documentos de múltiples tipos a Markdown.|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Software de OCR por lotes, sin conexión, gratuito y de código abierto.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Herramienta de análisis de pantalla para agentes GUI basados puramente en visión.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Preguntas y respuestas basadas en cualquier cosa.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Un potente toolkit de código abierto diseñado para extraer eficientemente contenido de alta calidad de documentos PDF complejos y diversos.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Reconoce texto en la pantalla, lo traduce y muestra los resultados de la traducción en tiempo real.|
|
||||
| [Conozca más proyectos](./awesome_projects.md) | [Más proyectos basados en PaddleOCR](./awesome_projects.md)|
|
||||
|
||||
## 👩👩👧👦 Contribuidores
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 Licencia
|
||||
Este proyecto se publica bajo la [licencia Apache 2.0](LICENSE).
|
||||
|
||||
## 🎓 Citación
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
```
|
||||
342
PaddleOCR-3.1.0/README_fr.md
Normal file
342
PaddleOCR-3.1.0/README_fr.md
Normal file
@ -0,0 +1,342 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="Bannière PaddleOCR">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | Français | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 Introduction
|
||||
Depuis sa sortie initiale, PaddleOCR a été largement acclamé par les milieux universitaires, industriels et de la recherche, grâce à ses algorithmes de pointe et à ses performances éprouvées dans des applications réelles. Il alimente déjà des projets open-source populaires tels que Umi-OCR, OmniParser, MinerU et RAGFlow, ce qui en fait la boîte à outils OCR de référence pour les développeurs du monde entier.
|
||||
|
||||
Le 20 mai 2025, l'équipe de PaddlePaddle a dévoilé PaddleOCR 3.0, entièrement compatible avec la version officielle du framework **PaddlePaddle 3.0**. Cette mise à jour **améliore encore la précision de la reconnaissance de texte**, ajoute la prise en charge de la **reconnaissance de multiples types de texte** et de la **reconnaissance de l'écriture manuscrite**, et répond à la demande croissante des applications de grands modèles pour l'**analyse de haute précision de documents complexes**. Combiné avec **ERNIE 4.5 Turbo**, il améliore considérablement la précision de l'extraction d'informations clés. Pour la documentation d'utilisation complète, veuillez vous référer à la [Documentation de PaddleOCR 3.0](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
|
||||
|
||||
Trois nouvelles fonctionnalités majeures dans PaddleOCR 3.0 :
|
||||
- Modèle de reconnaissance de texte pour toutes scènes [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md) : Un modèle unique qui gère cinq types de texte différents ainsi que l'écriture manuscrite complexe. La précision globale de la reconnaissance a augmenté de 13 points de pourcentage par rapport à la génération précédente. [Démo en ligne](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- Solution d'analyse de documents générique [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md) : Fournit une analyse de haute précision des PDF multi-mises en page et multi-scènes, surpassant de nombreuses solutions open-source et propriétaires sur les benchmarks publics. [Démo en ligne](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- Solution de compréhension de documents intelligente [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md) : Nativement propulsé par le grand modèle ERNIE 4.5 Turbo, atteignant une précision supérieure de 15 points de pourcentage à celle de son prédécesseur. [Démo en ligne](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
En plus de fournir une bibliothèque de modèles exceptionnelle, PaddleOCR 3.0 propose également des outils conviviaux couvrant l'entraînement de modèles, l'inférence et le déploiement de services, afin que les développeurs puissent rapidement mettre en production des applications d'IA.
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="Architecture de PaddleOCR">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 📣 Mises à jour récentes
|
||||
|
||||
#### **29/06/2025 : Sortie de PaddleOCR 3.1.0**, comprend :
|
||||
|
||||
- **Modèles et pipelines principaux :**
|
||||
- **Ajout du modèle de reconnaissance de texte multilingue PP-OCRv5**, prenant en charge l'entraînement et l'inférence pour 37 langues, dont le français, l'espagnol, le portugais, le russe, le coréen, etc. **Précision moyenne améliorée de plus de 30 %.** [Détails](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- Mise à niveau du **modèle PP-Chart2Table** dans PP-StructureV3, améliorant davantage la conversion des graphiques en tableaux. Sur des ensembles d'évaluation internes personnalisés, la métrique (RMS-F1) **a augmenté de 9,36 points de pourcentage (71,24 % -> 80,60 %).**
|
||||
- Lancement du **pipeline de traduction de documents, PP-DocTranslation, basé sur PP-StructureV3 et ERNIE 4.5 Turbo**, prenant en charge la traduction des documents au format Markdown, des PDF à mise en page complexe, et des images de documents, avec sauvegarde des résultats au format Markdown. [Détails](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **Nouveau serveur MCP :** [Détails](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **Prend en charge les pipelines OCR et PP-StructureV3.**
|
||||
- Prend en charge trois modes de fonctionnement : bibliothèque Python locale, service cloud communautaire AIStudio et service auto-hébergé.
|
||||
- Prend en charge l'appel des services locaux via stdio et des services distants via Streamable HTTP.
|
||||
|
||||
- **Optimisation de la documentation :** Amélioration des descriptions dans certains guides utilisateurs pour une expérience de lecture plus fluide.
|
||||
|
||||
#### **26/06/2025 : Publication de PaddleOCR 3.0.3, incluant :**
|
||||
|
||||
- Correction de bug : Résolution du problème où le paramètre `enable_mkldnn` ne fonctionnait pas, rétablissant le comportement par défaut d'utilisation de MKL-DNN pour l'inférence CPU.
|
||||
|
||||
#### 🔥🔥**19/06/2025 : Publication de PaddleOCR 3.0.2, incluant :**
|
||||
|
||||
- **Nouvelles fonctionnalités :**
|
||||
|
||||
- La source de téléchargement par défaut a été changée de `BOS` à `HuggingFace`. Les utilisateurs peuvent également changer la variable d'environnement `PADDLE_PDX_MODEL_SOURCE` en `BOS` pour rétablir la source de téléchargement sur Baidu Object Storage (BOS).
|
||||
- Ajout d'exemples d'appel de service pour six langues — C++, Java, Go, C#, Node.js et PHP — pour les pipelines tels que PP-OCRv5, PP-StructureV3 et PP-ChatOCRv4.
|
||||
- Amélioration de l'algorithme de tri de partition de mise en page dans le pipeline PP-StructureV3, améliorant la logique de tri pour les mises en page verticales complexes afin de fournir de meilleurs résultats.
|
||||
- Logique de sélection de modèle améliorée : lorsqu'une langue est spécifiée mais pas une version de modèle, le système sélectionnera automatiquement la dernière version du modèle prenant en charge cette langue.
|
||||
- Définition d'une limite supérieure par défaut pour la taille du cache MKL-DNN afin d'éviter une croissance illimitée, tout en permettant aux utilisateurs de configurer la capacité du cache.
|
||||
- Mise à jour des configurations par défaut pour l'inférence haute performance afin de prendre en charge l'accélération Paddle MKL-DNN et optimisation de la logique de sélection automatique de la configuration pour des choix plus intelligents.
|
||||
- Ajustement de la logique d'obtention du périphérique par défaut pour tenir compte du support réel des dispositifs de calcul par le framework Paddle installé, rendant le comportement du programme plus intuitif.
|
||||
- Ajout d'un exemple Android pour PP-OCRv5. [Détails](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
|
||||
|
||||
- **Corrections de bugs :**
|
||||
|
||||
- Correction d'un problème où certains paramètres CLI dans PP-StructureV3 ne prenaient pas effet.
|
||||
- Résolution d'un problème où `export_paddlex_config_to_yaml` ne fonctionnait pas correctement dans certains cas.
|
||||
- Correction de l'écart entre le comportement réel de `save_path` et sa description dans la documentation.
|
||||
- Correction d'erreurs potentielles de multithreading lors de l'utilisation de MKL-DNN dans le déploiement de services de base.
|
||||
- Correction des erreurs d'ordre des canaux dans le prétraitement des images pour le modèle Latex-OCR.
|
||||
- Correction des erreurs d'ordre des canaux lors de la sauvegarde des images visualisées dans le module de reconnaissance de texte.
|
||||
- Résolution des erreurs d'ordre des canaux dans les résultats de tableaux visualisés dans le pipeline PP-StructureV3.
|
||||
- Correction d'un problème de débordement dans le calcul de `overlap_ratio` dans des circonstances très spéciales dans le pipeline PP-StructureV3.
|
||||
|
||||
- **Améliorations de la documentation :**
|
||||
|
||||
- Mise à jour de la description du paramètre `enable_mkldnn` dans la documentation pour refléter précisément le comportement réel du programme.
|
||||
- Correction d'erreurs dans la documentation concernant les paramètres `lang` et `ocr_version`.
|
||||
- Ajout d'instructions pour l'exportation des fichiers de configuration de la ligne de production via CLI.
|
||||
- Correction des colonnes manquantes dans le tableau de données de performance pour PP-OCRv5.
|
||||
- Affinement des métriques de benchmark pour PP-StructureV3 pour différentes configurations.
|
||||
|
||||
- **Autres :**
|
||||
|
||||
- Assouplissement des restrictions de version sur les dépendances comme numpy et pandas, restaurant la prise en charge de Python 3.12.
|
||||
|
||||
<details>
|
||||
<summary><strong>Historique des mises à jour</strong></summary>
|
||||
|
||||
#### **🔥🔥 05/06/2025 : Publication de PaddleOCR 3.0.1, incluant :**
|
||||
|
||||
- **Optimisation de certains modèles et de leurs configurations :**
|
||||
- Mise à jour de la configuration par défaut du modèle pour PP-OCRv5, en passant les modèles de détection et de reconnaissance de `mobile` à `server`. Pour améliorer les performances par défaut dans la plupart des scénarios, le paramètre `limit_side_len` dans la configuration a été changé de 736 à 64.
|
||||
- Ajout d'un nouveau modèle de classification de l'orientation des lignes de texte `PP-LCNet_x1_0_textline_ori` avec une précision de 99,42%. Le classifieur d'orientation de ligne de texte par défaut pour les pipelines OCR, PP-StructureV3 et PP-ChatOCRv4 a été mis à jour vers ce modèle.
|
||||
- Optimisation du modèle de classification de l'orientation des lignes de texte `PP-LCNet_x0_25_textline_ori`, améliorant la précision de 3,3 points de pourcentage pour atteindre une précision actuelle de 98,85%.
|
||||
|
||||
- **Optimisations et corrections de certains problèmes de la version 3.0.0, [détails](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
|
||||
|
||||
🔥🔥20/05/2025 : Lancement officiel de **PaddleOCR v3.0**, incluant :
|
||||
- **PP-OCRv5** : Modèle de reconnaissance de texte de haute précision pour tous les scénarios - Texte instantané à partir d'images/PDF.
|
||||
1. 🌐 Prise en charge par un seul modèle de **cinq** types de texte - Traitez de manière transparente le **chinois simplifié, le chinois traditionnel, le pinyin chinois simplifié, l'anglais** et le **japonais** au sein d'un seul modèle.
|
||||
2. ✍️ **Reconnaissance de l'écriture manuscrite** améliorée : Nettement plus performant sur les écritures cursives complexes et non standard.
|
||||
3. 🎯 **Gain de précision de 13 points** par rapport à PP-OCRv4, atteignant des performances de pointe dans une variété de scénarios réels.
|
||||
|
||||
- **PP-StructureV3** : Analyse de documents à usage général – Libérez une analyse d'images/PDF de pointe pour des scénarios du monde réel !
|
||||
1. 🧮 **Analyse de PDF multi-scènes de haute précision**, devançant les solutions open-source et propriétaires sur le benchmark OmniDocBench.
|
||||
2. 🧠 Les capacités spécialisées incluent la **reconnaissance de sceaux**, la **conversion de graphiques en tableaux**, la **reconnaissance de tableaux avec formules/images imbriquées**, l'**analyse de documents à texte vertical** et l'**analyse de structures de tableaux complexes**.
|
||||
|
||||
- **PP-ChatOCRv4** : Compréhension intelligente de documents – Extrayez des informations clés, pas seulement du texte, à partir d'images/PDF.
|
||||
1. 🔥 **Gain de précision de 15 points** dans l'extraction d'informations clés sur les fichiers PDF/PNG/JPG par rapport à la génération précédente.
|
||||
2. 💻 Prise en charge native de **ERNIE 4.5 Turbo**, avec une compatibilité pour les déploiements de grands modèles via PaddleNLP, Ollama, vLLM, et plus encore.
|
||||
3. 🤝 Intégration de [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), permettant l'extraction et la compréhension de texte imprimé, d'écriture manuscrite, de sceaux, de tableaux, de graphiques et d'autres éléments courants dans les documents complexes.
|
||||
|
||||
[Historique des mises à jour](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ Démarrage Rapide
|
||||
### 1. Lancer la démo en ligne
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. Installation
|
||||
|
||||
Installez PaddlePaddle en vous référant au [Guide d'installation](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), puis installez la boîte à outils PaddleOCR.
|
||||
|
||||
```bash
|
||||
# Installer paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. Exécuter l'inférence par CLI
|
||||
```bash
|
||||
# Exécuter l'inférence PP-OCRv5
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# Exécuter l'inférence PP-StructureV3
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Obtenez d'abord la clé API Qianfan, puis exécutez l'inférence PP-ChatOCRv4
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Obtenir plus d'informations sur "paddleocr ocr"
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. Exécuter l'inférence par API
|
||||
**4.1 Exemple PP-OCRv5**
|
||||
```python
|
||||
# Initialiser l'instance de PaddleOCR
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# Exécuter l'inférence OCR sur un exemple d'image
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# Visualiser les résultats et sauvegarder les résultats JSON
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 Exemple PP-StructureV3</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# Pour une image
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# Visualiser les résultats et sauvegarder les résultats JSON
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 Exemple PP-ChatOCRv4</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # votre api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # votre api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# Si un grand modèle multimodal est utilisé, le service mllm local doit être démarré. Vous pouvez vous référer à la documentation : https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md pour effectuer le déploiement et mettre à jour la configuration mllm_chat_bot_config.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # url de votre service mllm local
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # votre api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ⛰️ Tutoriels avancés
|
||||
- [Tutoriel PP-OCRv5](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [Tutoriel PP-StructureV3](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [Tutoriel PP-ChatOCRv4](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 Aperçu rapide des résultats d'exécution
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="Démo PP-OCRv5">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="Démo PP-StructureV3">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 Communauté
|
||||
|
||||
| Compte officiel WeChat de PaddlePaddle | Rejoignez le groupe de discussion technique |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
## 😃 Projets formidables utilisant PaddleOCR
|
||||
PaddleOCR ne serait pas là où il est aujourd'hui sans son incroyable communauté ! 💗 Un immense merci à tous nos partenaires de longue date, nos nouveaux collaborateurs, et tous ceux qui ont mis leur passion dans PaddleOCR — que nous vous ayons nommés ou non. Votre soutien nous anime !
|
||||
|
||||
| Nom du projet | Description |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|Moteur RAG basé sur la compréhension profonde des documents.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Outil de conversion de documents multi-types en Markdown.|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Logiciel d'OCR hors ligne, gratuit, open-source et par lots.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a>|Outil d'analyse d'écran pour agent GUI basé sur la vision pure.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a>|Questions et réponses basées sur n'importe quel contenu.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Une puissante boîte à outils open-source conçue pour extraire efficacement du contenu de haute qualité à partir de documents PDF complexes et diversifiés.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a>|Reconnaît le texte à l'écran, le traduit et affiche les résultats de la traduction en temps réel.|
|
||||
| [En savoir plus](./awesome_projects.md) | [Plus de projets basés sur PaddleOCR](./awesome_projects.md)|
|
||||
|
||||
## 👩👩👧👦 Contributeurs
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
## 📄 Licence
|
||||
Ce projet est publié sous la [licence Apache 2.0](LICENSE).
|
||||
|
||||
## 🎓 Citation
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
340
PaddleOCR-3.1.0/README_ja.md
Normal file
340
PaddleOCR-3.1.0/README_ja.md
Normal file
@ -0,0 +1,340 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR Banner">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | 日本語 | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 概要
|
||||
PaddleOCRは、その最先端のアルゴリズムと実世界での応用実績により、初回リリース以来、学術界、産業界、研究コミュニティから広く支持を得ています。Umi-OCR、OmniParser、MinerU、RAGFlowなどの人気オープンソースプロジェクトで既に採用されており、世界中の開発者にとって定番のOCRツールキットとなっています。
|
||||
|
||||
2025年5月20日、PaddlePaddleチームは**PaddlePaddle 3.0**フレームワークの公式リリースに完全対応したPaddleOCR 3.0を発表しました。このアップデートでは、**テキスト認識精度**がさらに向上し、**複数テキストタイプの認識**と**手書き文字認識**がサポートされ、大規模モデルアプリケーションからの**複雑なドキュメントの高精度解析**に対する高まる需要に応えます。**ERNIE 4.5 Turbo**と組み合わせることで、キー情報抽出の精度が大幅に向上します。完全な使用方法については、[PaddleOCR 3.0 ドキュメント](https://paddlepaddle.github.io/PaddleOCR/latest/ja/index.html) をご参照ください。
|
||||
|
||||
PaddleOCR 3.0の3つの主要な新機能:
|
||||
- 全シーン対応テキスト認識モデル [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): 1つのモデルで5つの異なるテキストタイプと複雑な手書き文字を処理。全体の認識精度は前世代に比べて13パーセントポイント向上。[オンラインデモ](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- 汎用ドキュメント解析ソリューション [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): 複数レイアウト、複数シーンのPDFの高精度解析を実現し、公開ベンチマークで多くのオープンソースおよびクローズドソースのソリューションを凌駕。[オンラインデモ](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- インテリジェントドキュメント理解ソリューション [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): ERNIE 4.5 Turboにネイティブで対応し、前世代よりも15パーセントポイント高い精度を達成。[オンラインデモ](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
PaddleOCR 3.0は、優れたモデルライブラリを提供するだけでなく、モデルのトレーニング、推論、サービス展開をカバーする使いやすいツールも提供しており、開発者がAIアプリケーションを迅速に本番環境に導入できるよう支援します。
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR Architecture">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
## 📣 最近のアップデート
|
||||
|
||||
#### **2025.06.29:PaddleOCR 3.1.0 をリリース**、内容は以下の通りです:
|
||||
|
||||
- **主なモデルとパイプライン:**
|
||||
- **PP-OCRv5 多言語テキスト認識モデルを追加**、フランス語、スペイン語、ポルトガル語、ロシア語、韓国語など 37 言語に対応。**平均精度が 30%以上向上。** [詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- PP-StructureV3 の **PP-Chart2Table モデル**をアップグレードし、グラフから表への変換能力をさらに強化。社内カスタム評価セットでは、指標(RMS-F1)が **9.36 ポイント向上(71.24% → 80.60%)。**
|
||||
- PP-StructureV3 および ERNIE 4.5 Turbo に基づく**ドキュメント翻訳パイプライン PP-DocTranslation**を新たに追加。Markdown 形式ドキュメント、さまざまな複雑レイアウトの PDF ドキュメント、ドキュメント画像の翻訳に対応し、結果を Markdown 形式で保存可能。[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **新しい MCP サーバー:** [詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **OCR と PP-StructureV3 パイプラインの両方をサポートします。**
|
||||
- ローカル Python ライブラリ、AIStudio コミュニティクラウドサービス、セルフホストサービスの3つの動作モードをサポートします。
|
||||
- stdio を介してローカルサービスを呼び出し、Streamable HTTP を介してリモートサービスを呼び出すことができます。
|
||||
|
||||
- **ドキュメント最適化:** 一部のユーザーガイドの説明を改善し、よりスムーズな読書体験を提供。
|
||||
|
||||
#### 🔥🔥**2025.06.26: PaddleOCR 3.0.3のリリース、以下の内容を含みます:**
|
||||
|
||||
- バグ修正:`enable_mkldnn`パラメータが機能しない問題を修正し、CPUがデフォルトでMKL-DNN推論を使用する動作を復元しました。
|
||||
|
||||
#### 🔥🔥**2025.06.19: PaddleOCR 3.0.2のリリース、以下の内容を含みます:**
|
||||
|
||||
- **新機能:**
|
||||
- デフォルトのダウンロード元が`BOS`から`HuggingFace`に変更されました。ユーザーは環境変数 `PADDLE_PDX_MODEL_SOURCE` を `BOS` に変更することで、モデルのダウンロード元をBaidu Object Storage (BOS)に戻すこともできます。
|
||||
- PP-OCRv5、PP-StructureV3、PP-ChatOCRv4などのパイプラインに、C++、Java、Go、C#、Node.js、PHPの6言語のサービス呼び出し例を追加しました。
|
||||
- PP-StructureV3パイプラインのレイアウト分割ソートアルゴリズムを改善し、複雑な縦書きレイアウトのソートロジックを強化して、より良い結果を提供します。
|
||||
- モデル選択ロジックを強化:言語が指定されているがモデルのバージョンが指定されていない場合、システムはその言語をサポートする最新のモデルバージョンを自動的に選択します。
|
||||
- MKL-DNNキャッシュサイズにデフォルトの上限を設定し、無制限の増加を防ぎます。同時に、ユーザーがキャッシュ容量を設定することも可能です。
|
||||
- 高性能推論のデフォルト設定を更新し、Paddle MKL-DNNアクセラレーションをサポートし、よりスマートな選択のための自動設定選択ロジックを最適化しました。
|
||||
- インストールされているPaddleフレームワークによる計算デバイスの実際のサポートを考慮するようにデフォルトデバイスの取得ロジックを調整し、プログラムの動作をより直感的にしました。
|
||||
- PP-OCRv5のAndroidサンプルを追加しました。[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html)。
|
||||
- **バグ修正:**
|
||||
- PP-StructureV3の一部のCLIパラメータが有効にならない問題を修正しました。
|
||||
- 特定のケースで`export_paddlex_config_to_yaml`が正しく機能しない問題を解決しました。
|
||||
- `save_path`の実際の動作とそのドキュメントの記述との間の不一致を修正しました。
|
||||
- 基本的なサービス展開でMKL-DNNを使用する際の潜在的なマルチスレッドエラーを修正しました。
|
||||
- Latex-OCRモデルの画像前処理におけるチャネル順序のエラーを修正しました。
|
||||
- テキスト認識モジュールで可視化画像を保存する際のチャネル順序のエラーを修正しました。
|
||||
- PP-StructureV3パイプラインで可視化されたテーブル結果のチャネル順序のエラーを解決しました。
|
||||
- PP-StructureV3パイプラインで非常に特殊な状況下で`overlap_ratio`を計算する際のオーバーフロー問題を修正しました。
|
||||
- **ドキュメントの改善:**
|
||||
- ドキュメント内の`enable_mkldnn`パラメータの説明を更新し、プログラムの実際の動作を正確に反映するようにしました。
|
||||
- `lang`および`ocr_version`パラメータに関するドキュメントのエラーを修正しました。
|
||||
- CLIを介してプロダクションライン設定ファイルをエクスポートする手順を追加しました。
|
||||
- PP-OCRv5のパフォーマンスデータテーブルで欠落していた列を修正しました。
|
||||
- さまざまな構成におけるPP-StructureV3のベンチマーク指標を洗練しました。
|
||||
- **その他:**
|
||||
- numpyやpandasなどの依存関係のバージョン制限を緩和し、Python 3.12のサポートを復元しました。
|
||||
|
||||
<details>
|
||||
<summary><strong>更新履歴</strong></summary>
|
||||
|
||||
#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1のリリース、以下の内容を含みます:**
|
||||
|
||||
- **一部のモデルとモデル設定の最適化:**
|
||||
- PP-OCRv5のデフォルトモデル設定を更新し、検出と認識の両方をmobileモデルからserverモデルに変更しました。ほとんどのシーンでのデフォルト性能を向上させるため、設定の`limit_side_len`パラメータを736から64に変更しました。
|
||||
- 新しいテキスト行方向分類モデル`PP-LCNet_x1_0_textline_ori`(精度99.42%)を追加しました。OCR、PP-StructureV3、およびPP-ChatOCRv4パイプラインのデフォルトのテキスト行方向分類器がこのモデルに更新されました。
|
||||
- テキスト行方向分類モデル`PP-LCNet_x0_25_textline_ori`を最適化し、精度が3.3パーセントポイント向上し、現在の精度は98.85%です。
|
||||
|
||||
- **バージョン3.0.0の一部の問題の最適化と修正、[詳細](https://paddlepaddle.github.io/PaddleOCR/latest/ja/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: **PaddleOCR v3.0**の公式リリース、以下の内容を含みます:
|
||||
- **PP-OCRv5**: あらゆるシーンに対応する高精度テキスト認識モデル - 画像/PDFから瞬時にテキストを抽出。
|
||||
1. 🌐 単一モデルで**5つ**のテキストタイプをサポート - **簡体字中国語、繁体字中国語、簡体字中国語ピンイン、英語**、**日本語**をシームレスに処理。
|
||||
2. ✍️ **手書き文字認識**の向上:複雑な草書体や非標準的な手書き文字の認識性能が大幅に向上。
|
||||
3. 🎯 PP-OCRv4に比べて**13ポイントの精度向上**を達成し、さまざまな実世界のシナリオで最先端の性能を実現。
|
||||
|
||||
- **PP-StructureV3**: 汎用ドキュメント解析 – 実世界のシナリオで最先端の画像/PDF解析を解放!
|
||||
1. 🧮 **高精度な複数シーンPDF解析**により、OmniDocBenchベンチマークでオープンソースおよびクローズドソースのソリューションをリード。
|
||||
2. 🧠 **印鑑認識**、**グラフからテーブルへの変換**、**ネストされた数式/画像を含むテーブル認識**、**縦書きテキスト文書の解析**、**複雑なテーブル構造分析**などの専門機能。
|
||||
|
||||
- **PP-ChatOCRv4**: インテリジェントなドキュメント理解 – 画像/PDFからテキストだけでなく、キー情報を抽出。
|
||||
1. 🔥 PDF/PNG/JPGファイルからのキー情報抽出において、前世代に比べて**15ポイントの精度向上**。
|
||||
2. 💻 **ERNIE 4.5 Turbo**をネイティブサポートし、PaddleNLP、Ollama、vLLMなどを介した大規模モデルのデプロイメントとの互換性あり。
|
||||
3. 🤝 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2) と統合し、印刷テキスト、手書き文字、印鑑、テーブル、グラフなど、複雑な文書内の一般的な要素の抽出と理解をサポート。
|
||||
|
||||
[更新履歴](https://paddlepaddle.github.io/PaddleOCR/latest/ja/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ クイックスタート
|
||||
### 1. オンラインデモの実行
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. インストール
|
||||
|
||||
[インストールガイド](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) を参照してPaddlePaddleをインストールした後、PaddleOCRツールキットをインストールします。
|
||||
|
||||
```bash
|
||||
# paddleocrのインストール
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. CLIによる推論の実行
|
||||
```bash
|
||||
# PP-OCRv5の推論を実行
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# PP-StructureV3の推論を実行
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 最初にQianfan APIキーを取得し、その後PP-ChatOCRv4の推論を実行
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# "paddleocr ocr" の詳細情報を取得
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. APIによる推論の実行
|
||||
**4.1 PP-OCRv5の例**
|
||||
```python
|
||||
# PaddleOCRインスタンスの初期化
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# サンプル画像でOCR推論を実行
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# 結果を可視化し、JSON形式で保存
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 PP-StructureV3の例</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# 画像の場合
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# 結果を可視化し、JSON形式で保存
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 PP-ChatOCRv4の例</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# マルチモーダル大規模モデルを使用する場合、ローカルmllmサービスを起動する必要があります。ドキュメント:https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md を参照してデプロイを行い、mllm_chat_bot_config設定を更新してください。
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ⛰️ 上級チュートリアル
|
||||
- [PP-OCRv5 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [PP-StructureV3 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [PP-ChatOCRv4 チュートリアル](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 実行結果のクイックレビュー
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 コミュニティ
|
||||
|
||||
| PaddlePaddle WeChat公式アカウント | 技術ディスカッショングループへの参加 |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 😃 PaddleOCRを活用した素晴らしいプロジェクト
|
||||
PaddleOCRは、その素晴らしいコミュニティなしでは今日の姿にはなりえませんでした!💗長年のパートナー、新しい協力者、そしてPaddleOCRに情熱を注いでくださったすべての方々に心から感謝申し上げます。皆様のサポートが私たちの原動力です!
|
||||
|
||||
| プロジェクト名 | 概要 |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|詳細なドキュメント理解に基づくRAGエンジン。|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|複数タイプのドキュメントからMarkdownへの変換ツール|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|無料、オープンソースのバッチオフラインOCRソフトウェア。|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: 純粋なビジョンベースのGUIエージェントのための画面解析ツール。|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |あらゆるものに基づいた質疑応答。|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|複雑で多様なPDFドキュメントから高品質なコンテンツを効率的に抽出するために設計された強力なオープンソースツールキット。|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |画面上のテキストを認識し、翻訳して、リアルタイムで翻訳結果を表示します。|
|
||||
| [他のプロジェクトを見る](./awesome_projects.md) | [PaddleOCRをベースにした他のプロジェクト](./awesome_projects.md)|
|
||||
|
||||
## 👩👩👧👦 貢献者
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 ライセンス
|
||||
このプロジェクトは[Apache 2.0 license](LICENSE)の下で公開されています。
|
||||
|
||||
## 🎓 引用
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
340
PaddleOCR-3.1.0/README_ko.md
Normal file
340
PaddleOCR-3.1.0/README_ko.md
Normal file
@ -0,0 +1,340 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="PaddleOCR 배너">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | 한국어 | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 소개
|
||||
PaddleOCR은 출시 이후 최첨단 알고리즘(algorithm)과 실제 애플리케이션(application)에서의 입증된 성능 덕분에 학계, 산업계, 연구 커뮤니티에서 폭넓은 찬사를 받아왔습니다. Umi-OCR, OmniParser, MinerU, RAGFlow와 같은 유명 오픈소스 프로젝트에 이미 적용되어 전 세계 개발자(developer)들에게 필수 OCR 툴킷(toolkit)으로 자리 잡았습니다.
|
||||
|
||||
2025년 5월 20일, PaddlePaddle 팀은 **PaddlePaddle 3.0** 프레임워크의 공식 릴리스와 완전히 호환되는 PaddleOCR 3.0을 발표했습니다. 이 업데이트는 **텍스트 인식 정확도를 더욱 향상**시키고, **다중 텍스트 유형 인식** 및 **필기 인식**을 지원하며, 대규모 모델 애플리케이션의 **복잡한 문서의 고정밀 구문 분석**에 대한 증가하는 수요를 충족합니다. **ERNIE 4.5 Turbo**와 결합하면 주요 정보 추출 정확도가 크게 향상됩니다. 사용 설명서 전체는 [PaddleOCR 3.0 문서](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)를 참조하십시오.
|
||||
|
||||
PaddleOCR 3.0의 세 가지 주요 신규 기능:
|
||||
- 범용 장면 텍스트 인식 모델(Universal-Scene Text Recognition Model) [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): 다섯 가지 다른 텍스트 유형과 복잡한 필기체를 처리하는 단일 모델입니다. 전체 인식 정확도는 이전 세대보다 13%p 향상되었습니다. [온라인 체험](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- 일반 문서 파싱(parsing) 솔루션 [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): 다중 레이아웃(multi-layout), 다중 장면 PDF의 고정밀 파싱(parsing)을 제공하며, 공개 벤치마크(benchmark)에서 많은 오픈 소스 및 클로즈드 소스 솔루션을 능가합니다. [온라인 체험](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- 지능형 문서 이해 솔루션 [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): ERNIE 4.5 Turbo에 의해 네이티브로 구동되며, 이전 모델보다 15%p 높은 정확도를 달성합니다. [온라인 체험](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
PaddleOCR 3.0은 뛰어난 모델 라이브러리(model library)를 제공할 뿐만 아니라 모델 훈련, 추론 및 서비스 배포를 포괄하는 사용하기 쉬운 도구를 제공하여 개발자가 AI 애플리케이션을 신속하게 상용화할 수 있도록 지원합니다.
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="PaddleOCR 아키텍처">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
## 📣 최신 업데이트
|
||||
|
||||
#### **2025.06.29: PaddleOCR 3.1.0 출시**, 주요 내용:
|
||||
|
||||
- **주요 모델 및 파이프라인:**
|
||||
- **PP-OCRv5 다국어 텍스트 인식 모델 추가**, 프랑스어, 스페인어, 포르투갈어, 러시아어, 한국어 등 37개 언어의 텍스트 인식 모델 학습 및 추론 지원. **평균 정확도 30% 이상 향상.** [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- PP-StructureV3의 **PP-Chart2Table 모델 업그레이드**, 차트에서 표로 변환하는 기능이 더욱 향상됨. 내부 커스텀 평가 세트에서 지표(RMS-F1)가 **9.36%p 상승(71.24% → 80.60%)**.
|
||||
- PP-StructureV3 및 ERNIE 4.5 Turbo 기반 **문서 번역 파이프라인 PP-DocTranslation 신규 출시**, Markdown 형식 문서, 다양한 복잡 레이아웃의 PDF 문서, 문서 이미지를 번역 지원, 결과는 Markdown 형식으로 저장 가능. [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **새로운 MCP 서버:** [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **OCR 및 PP-StructureV3 파이프라인을 모두 지원합니다.**
|
||||
- 로컬 Python 라이브러리, AIStudio 커뮤니티 클라우드 서비스, 자체 호스팅 서비스의 세 가지 작업 모드를 지원합니다.
|
||||
- stdio를 통해 로컬 서비스를 호출하고, Streamable HTTP를 통해 원격 서비스를 호출할 수 있습니다.
|
||||
|
||||
- **문서 최적화:** 일부 사용자 가이드 설명 개선으로 읽기 경험 향상.
|
||||
|
||||
#### **2025.06.26: PaddleOCR 3.0.3 릴리스, 포함 내용:**
|
||||
|
||||
- 버그 수정: `enable_mkldnn` 매개변수가 작동하지 않는 문제를 해결하고, CPU가 기본적으로 MKL-DNN 추론을 사용하는 동작을 복원했습니다.
|
||||
|
||||
#### **🔥🔥 2025.06.19: PaddleOCR 3.0.2 릴리스, 포함 내용:**
|
||||
|
||||
- **새로운 기능:**
|
||||
- 모델 기본 다운로드 소스가 `BOS`에서 `HuggingFace`로 변경되었습니다. 사용자는 환경 변수 `PADDLE_PDX_MODEL_SOURCE`를 `BOS`로 설정하여 모델 다운로드 소스를 Baidu Object Storage(BOS)로 되돌릴 수 있습니다.
|
||||
- PP-OCRv5, PP-StructureV3, PP-ChatOCRv4 파이프라인에 대해 C++, Java, Go, C#, Node.js, PHP 6개 언어의 서비스 호출 예제가 추가되었습니다.
|
||||
- PP-StructureV3 파이프라인의 레이아웃 파티션 정렬 알고리즘을 개선하여 복잡한 세로 레이아웃의 정렬 논리를 향상했습니다.
|
||||
- 언어(`lang`)만 지정하고 모델 버전을 명시하지 않은 경우, 해당 언어를 지원하는 최신 모델 버전을 자동으로 선택하도록 모델 선택 로직을 강화했습니다.
|
||||
- MKL-DNN 캐시 크기에 기본 상한을 설정하여 무한 확장을 방지하고, 사용자 정의 캐시 용량 설정을 지원합니다.
|
||||
- 고성능 추론의 기본 구성을 업데이트하여 Paddle MKL-DNN 가속을 지원하고, 자동 구성 선택 로직을 최적화했습니다.
|
||||
- 설치된 Paddle 프레임워크가 지원하는 실제 디바이스를 고려하도록 기본 디바이스 선택 로직을 조정했습니다.
|
||||
- PP-OCRv5의 Android 예제가 추가되었습니다. [자세히 보기](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
|
||||
|
||||
- **버그 수정:**
|
||||
- PP-StructureV3 일부 CLI 파라미터가 적용되지 않던 문제를 수정했습니다.
|
||||
- `export_paddlex_config_to_yaml`가 특정 상황에서 정상 동작하지 않던 문제를 해결했습니다.
|
||||
- `save_path`의 실제 동작과 문서 설명이 일치하지 않던 문제를 수정했습니다.
|
||||
- 기본 서비스화 배포에서 MKL-DNN을 사용할 때 발생할 수 있는 다중 스레딩 오류를 수정했습니다.
|
||||
- Latex-OCR 모델의 이미지 전처리 과정에서 채널 순서 오류를 수정했습니다.
|
||||
- 텍스트 인식 모듈에서 시각화 이미지를 저장할 때 발생하던 채널 순서 오류를 수정했습니다.
|
||||
- PP-StructureV3 파이프라인의 표 시각화 결과에 발생하던 채널 순서 오류를 수정했습니다.
|
||||
- PP-StructureV3 파이프라인에서 특수한 상황에서 `overlap_ratio` 계산 시 발생하던 오버플로 문제를 수정했습니다.
|
||||
|
||||
- **문서 개선:**
|
||||
- 문서의 `enable_mkldnn` 파라미터 설명을 프로그램의 실제 동작에 맞게 업데이트했습니다.
|
||||
- `lang` 및 `ocr_version` 파라미터에 대한 문서 오류를 수정했습니다.
|
||||
- CLI를 통해 생산 라인 설정 파일을 내보내는 방법을 문서에 추가했습니다.
|
||||
- PP-OCRv5 성능 데이터 표에서 누락된 열을 복원했습니다.
|
||||
- 다양한 구성에서 PP-StructureV3의 벤치마크 지표를 개선했습니다.
|
||||
|
||||
- **기타:**
|
||||
- numpy, pandas 등 의존성 버전 제한을 완화하여 Python 3.12 지원을 복원했습니다.
|
||||
|
||||
<details>
<summary><strong>업데이트 기록</strong></summary>

#### **🔥🔥 2025.06.05: PaddleOCR 3.0.1 릴리스, 포함 내용:**
|
||||
|
||||
- **일부 모델 및 모델 구성 최적화:**
|
||||
- PP-OCRv5의 기본 모델 구성을 업데이트하여 탐지 및 인식을 모두 mobile에서 server 모델로 변경했습니다. 대부분의 시나리오에서 기본 성능을 향상시키기 위해 구성의 `limit_side_len` 파라미터(parameter)가 736에서 64로 변경되었습니다.
|
||||
- 99.42%의 정확도를 가진 새로운 텍스트 라인 방향 분류 모델 `PP-LCNet_x1_0_textline_ori`를 추가했습니다. OCR, PP-StructureV3, PP-ChatOCRv4 파이프라인의 기본 텍스트 라인 방향 분류기가 이 모델로 업데이트되었습니다.
|
||||
- 텍스트 라인 방향 분류 모델 `PP-LCNet_x0_25_textline_ori`를 최적화하여 정확도를 3.3%p 향상시켜 현재 정확도는 98.85%입니다.
|
||||
|
||||
- **버전 3.0.0의 일부 문제점에 대한 최적화 및 수정, [상세 정보](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: **PaddleOCR v3.0** 정식 출시, 포함 내용:
|
||||
- **PP-OCRv5**: 모든 시나리오를 위한 고정밀 텍스트 인식 모델 - 이미지/PDF에서 즉시 텍스트 추출.
|
||||
1. 🌐 단일 모델로 **다섯 가지** 텍스트 유형 지원 - **중국어 간체, 중국어 번체, 중국어 간체 병음, 영어, 일본어**를 단일 모델 내에서 원활하게 처리합니다.
|
||||
2. ✍️ 향상된 **필기체 인식**: 복잡한 흘림체 및 비표준 필기체에서 성능이 크게 향상되었습니다.
|
||||
3. 🎯 PP-OCRv4에 비해 **정확도 13%p 향상**, 다양한 실제 시나리오에서 SOTA(state-of-the-art) 성능을 달성했습니다.
|
||||
|
||||
- **PP-StructureV3**: 범용 문서 파싱(parsing) – 실제 시나리오를 위한 SOTA 이미지/PDF 파싱(parsing) 성능!
|
||||
1. 🧮 **고정밀 다중 장면 PDF 파싱(parsing)**, OmniDocBench 벤치마크(benchmark)에서 오픈 소스 및 클로즈드 소스 솔루션을 모두 능가합니다.
|
||||
2. 🧠 전문 기능에는 **도장 인식**, **차트-표 변환**, **중첩된 수식/이미지가 있는 표 인식**, **세로 텍스트 문서 파싱(parsing)**, **복잡한 표 구조 분석** 등이 포함됩니다.
|
||||
|
||||
- **PP-ChatOCRv4**: 지능형 문서 이해 – 이미지/PDF에서 단순한 텍스트가 아닌 핵심 정보 추출.
|
||||
1. 🔥 이전 세대에 비해 PDF/PNG/JPG 파일의 핵심 정보 추출에서 **정확도 15%p 향상**.
|
||||
2. 💻 **ERNIE 4.5 Turbo** 기본 지원, PaddleNLP, Ollama, vLLM 등을 통한 대규모 모델 배포와 호환됩니다.
|
||||
3. 🤝 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2)와 통합되어 인쇄된 텍스트, 필기체, 도장, 표, 차트 등 복잡한 문서의 일반적인 요소 추출 및 이해를 지원합니다.
|
||||
|
||||
[히스토리 로그](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ 빠른 시작
|
||||
### 1. 온라인 데모 실행
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. 설치
|
||||
|
||||
[설치 가이드](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)를 참조하여 PaddlePaddle을 설치한 후, PaddleOCR 툴킷을 설치하십시오.
|
||||
|
||||
```bash
|
||||
# paddleocr 설치
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. CLI를 통한 추론 실행
|
||||
```bash
|
||||
# PP-OCRv5 추론 실행
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# PP-StructureV3 추론 실행
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 먼저 Qianfan API 키를 받고, PP-ChatOCRv4 추론 실행
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# "paddleocr ocr"에 대한 추가 정보 얻기
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. API를 통한 추론 실행
|
||||
**4.1 PP-OCRv5 예제**
|
||||
```python
|
||||
from paddleocr import PaddleOCR
|
||||
# PaddleOCR 인스턴스 초기화
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# 샘플 이미지에 대해 OCR 추론 실행
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# 결과 시각화 및 JSON 결과 저장
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 PP-StructureV3 예제</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# 이미지용
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# 결과 시각화 및 JSON 결과 저장
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 PP-ChatOCRv4 예제</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# 다중 모드 대형 모델을 사용하는 경우 로컬 mllm 서비스를 시작해야 합니다. 문서: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md를 참조하여 배포하고 mllm_chat_bot_config 구성을 업데이트할 수 있습니다.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
## ⛰️ 고급 튜토리얼
|
||||
- [PP-OCRv5 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [PP-StructureV3 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [PP-ChatOCRv4 튜토리얼](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 실행 결과 빠른 개요
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 데모">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 데모">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 커뮤니티
|
||||
|
||||
| PaddlePaddle 위챗(WeChat) 공식 계정 | 기술 토론 그룹 가입 |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 🏆 PaddleOCR을 활용하는 우수 프로젝트
|
||||
PaddleOCR의 발전은 커뮤니티 없이는 불가능합니다! 💗 오랜 파트너, 새로운 협력자, 그리고 이름을 언급했든 안 했든 PaddleOCR에 열정을 쏟아부은 모든 분들께 진심으로 감사드립니다. 여러분의 지원이 우리의 원동력입니다!
|
||||
|
||||
| 프로젝트 이름 | 설명 |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|심층 문서 이해 기반의 RAG 엔진.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|다중 유형 문서를 마크다운(Markdown)으로 변환하는 도구|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|무료, 오픈 소스, 배치 오프라인 OCR 소프트웨어.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |순수 비전 기반 GUI 에이전트를 위한 화면 파싱(parsing) 도구.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |무엇이든 기반으로 한 질의응답 시스템.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|복잡하고 다양한 PDF 문서에서 고품질 콘텐츠를 효율적으로 추출하도록 설계된 강력한 오픈 소스 툴킷.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |화면의 텍스트를 인식하여 번역하고 번역 결과를 실시간으로 표시합니다.|
|
||||
| [더 많은 프로젝트 보기](./awesome_projects.md) | [PaddleOCR 기반의 더 많은 프로젝트](./awesome_projects.md)|
|
||||
|
||||
## 👩👩👧👦 기여자
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 라이선스
|
||||
이 프로젝트는 [Apache 2.0 license](LICENSE)에 따라 배포됩니다.
|
||||
|
||||
## 🎓 인용
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
348
PaddleOCR-3.1.0/README_ru.md
Normal file
348
PaddleOCR-3.1.0/README_ru.md
Normal file
@ -0,0 +1,348 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner.png" alt="Баннер PaddleOCR">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | [繁體中文](./README_tcn.md) | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | Русский | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 Введение
|
||||
С момента своего первого выпуска PaddleOCR получил широкое признание в академических, промышленных и исследовательских кругах благодаря своим передовым алгоритмам и доказанной производительности в реальных приложениях. Он уже используется в таких популярных проектах с открытым исходным кодом, как Umi-OCR, OmniParser, MinerU и RAGFlow, что делает его предпочтительным инструментарием OCR для разработчиков по всему миру.
|
||||
|
||||
20 мая 2025 года команда PaddlePaddle представила PaddleOCR 3.0, полностью совместимый с официальным выпуском фреймворка **PaddlePaddle 3.0**. Это обновление еще больше **повышает точность распознавания текста**, добавляет поддержку **распознавания нескольких типов текста** и **распознавания рукописного текста**, а также удовлетворяет растущий спрос на приложения с большими моделями для **высокоточного анализа сложных документов**. В сочетании с **ERNIE 4.5 Turbo** он значительно улучшает точность извлечения ключевой информации. PaddleOCR 3.0 также вводит поддержку китайских гетерогенных ИИ-ускорителей, таких как **KUNLUNXIN** и **Ascend**. Для получения полной документации по использованию, пожалуйста, обратитесь к [Документации PaddleOCR 3.0](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html).
|
||||
|
||||
Три новые ключевые функции в PaddleOCR 3.0:
|
||||
- Универсальная модель распознавания текста в любых сценах [PP-OCRv5](./docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.en.md): Одна модель обрабатывает пять различных типов текста и сложный рукописный ввод. Общая точность распознавания увеличилась на 13 процентных пунктов по сравнению с предыдущим поколением. [Онлайн-демо](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
|
||||
- Общее решение для парсинга документов [PP-StructureV3](./docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.en.md): Обеспечивает высокоточный парсинг PDF-файлов с различными макетами и сценариями, превосходя многие решения с открытым и закрытым исходным кодом по результатам публичных тестов. [Онлайн-демо](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
|
||||
- Интеллектуальное решение для понимания документов [PP-ChatOCRv4](./docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.en.md): Нативно поддерживается большой моделью ERNIE 4.5 Turbo, достигая на 15 процентных пунктов более высокой точности, чем его предшественник. [Онлайн-демо](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
Помимо предоставления выдающейся библиотеки моделей, PaddleOCR 3.0 также предлагает удобные инструменты, охватывающие обучение моделей, инференс и развертывание сервисов, чтобы разработчики могли быстро внедрять ИИ-приложения в производство.
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch.png" alt="Архитектура PaddleOCR">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
## 📣 Последние обновления
|
||||
|
||||
#### **2025.06.29: Выпуск PaddleOCR 3.1.0**, включает:
|
||||
|
||||
- **Основные модели и пайплайны:**
|
||||
- **Добавлена многоязычная модель распознавания текста PP-OCRv5**, поддерживающая обучение и инференс для 37 языков, включая французский, испанский, португальский, русский, корейский и др. **Средняя точность увеличилась более чем на 30%.** [Подробнее](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- Обновлена **модель PP-Chart2Table** в PP-StructureV3, что еще больше улучшило преобразование графиков в таблицы. На внутренних пользовательских тестах метрика (RMS-F1) **увеличилась на 9,36 процентных пункта (71,24% -> 80,60%).**
|
||||
- Запущен новый **конвейер перевода документов PP-DocTranslation на основе PP-StructureV3 и ERNIE 4.5 Turbo**, поддерживающий перевод документов в формате Markdown, различных PDF-документов со сложной версткой и изображений документов, с сохранением результата в формате Markdown. [Подробнее](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **Новый сервер MCP:** [Details](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/mcp_server.html)
|
||||
- **Поддерживает как OCR, так и конвейеры PP-StructureV3.**
|
||||
- Поддерживаются три режима работы: локальная библиотека Python, облачный сервис сообщества AIStudio и самостоятельный хостинг.
|
||||
- Поддерживается вызов локальных сервисов через stdio и удалённых сервисов через Streamable HTTP.
|
||||
|
||||
- **Оптимизация документации:** Улучшены описания в некоторых руководствах пользователя для более комфортного чтения.
|
||||
|
||||
|
||||
#### **2025.06.26: Релиз PaddleOCR 3.0.3, включает:**
|
||||
|
||||
- Исправление ошибки: Исправлена проблема, из-за которой параметр `enable_mkldnn` не действовал, восстановлено поведение использования MKL-DNN для вывода на CPU по умолчанию.
|
||||
|
||||
#### 🔥🔥**2025.06.19: Релиз PaddleOCR 3.0.2, включает:**
|
||||
|
||||
- **Новые возможности:**
|
||||
- Источник загрузки по умолчанию изменен с `BOS` на `HuggingFace`. Пользователи также могут изменить переменную окружения `PADDLE_PDX_MODEL_SOURCE` на `BOS`, чтобы установить источник загрузки моделей обратно на Baidu Object Storage (BOS).
|
||||
- Добавлены примеры вызова сервисов для шести языков — C++, Java, Go, C#, Node.js и PHP — для пайплайнов, таких как PP-OCRv5, PP-StructureV3 и PP-ChatOCRv4.
|
||||
- Улучшен алгоритм сортировки разделов макета в пайплайне PP-StructureV3, улучшена логика сортировки для сложных вертикальных макетов для достижения лучших результатов.
|
||||
- Улучшена логика выбора модели: когда указан язык, но не указана версия модели, система автоматически выберет последнюю версию модели, поддерживающую этот язык.
|
||||
- Установлен верхний предел по умолчанию для размера кэша MKL-DNN для предотвращения неограниченного роста, а также предоставлена пользователям возможность настраивать емкость кэша.
|
||||
- Обновлены конфигурации по умолчанию для высокопроизводительного инференса для поддержки ускорения Paddle MKL-DNN и оптимизирована логика автоматического выбора конфигурации для более разумного выбора.
|
||||
- Скорректирована логика получения устройства по умолчанию для учета фактической поддержки вычислительных устройств установленным фреймворком Paddle, что делает поведение программы более интуитивным.
|
||||
- Добавлен пример для Android для PP-OCRv5. [Подробности](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/deployment/on_device_deployment.html).
|
||||
|
||||
- **Исправления ошибок:**
|
||||
- Исправлена проблема с некоторыми параметрами CLI в PP-StructureV3, которые не вступали в силу.
|
||||
- Решена проблема, из-за которой `export_paddlex_config_to_yaml` в некоторых случаях работала некорректно.
|
||||
- Исправлено несоответствие между фактическим поведением `save_path` и его описанием в документации.
|
||||
- Исправлены потенциальные ошибки многопоточности при использовании MKL-DNN в базовом развертывании сервиса.
|
||||
- Исправлены ошибки порядка каналов в предварительной обработке изображений для модели Latex-OCR.
|
||||
- Исправлены ошибки порядка каналов при сохранении визуализированных изображений в модуле распознавания текста.
|
||||
- Решены ошибки порядка каналов в визуализированных результатах таблиц в пайплайне PP-StructureV3.
|
||||
- Исправлена проблема переполнения при вычислении `overlap_ratio` в крайне особых обстоятельствах в пайплайне PP-StructureV3.
|
||||
|
||||
- **Улучшения документации:**
|
||||
- Обновлено описание параметра `enable_mkldnn` в документации, чтобы оно точно отражало фактическое поведение программы.
|
||||
- Исправлены ошибки в документации, касающиеся параметров `lang` и `ocr_version`.
|
||||
- Добавлены инструкции по экспорту файлов конфигурации производственной линии через CLI.
|
||||
- Исправлены отсутствующие столбцы в таблице данных о производительности для PP-OCRv5.
|
||||
- Уточнены метрики бенчмарков для PP-StructureV3 для различных конфигураций.
|
||||
|
||||
- **Прочее:**
|
||||
- Ослаблены ограничения версий для зависимостей, таких как numpy и pandas, восстановлена поддержка Python 3.12.
|
||||
|
||||
<details>
|
||||
<summary><strong>История обновлений</strong></summary>
|
||||
|
||||
#### **🔥🔥 2025.06.05: Релиз PaddleOCR 3.0.1, включает:**
|
||||
|
||||
- **Оптимизация некоторых моделей и их конфигураций:**
|
||||
- Обновлена конфигурация модели по умолчанию для PP-OCRv5: модели обнаружения и распознавания изменены с `mobile` на `server`. Для улучшения производительности по умолчанию в большинстве сценариев параметр `limit_side_len` в конфигурации изменен с 736 на 64.
|
||||
- Добавлена новая модель классификации ориентации строк текста `PP-LCNet_x1_0_textline_ori` с точностью 99.42%. Классификатор ориентации строк текста по умолчанию для пайплайнов OCR, PP-StructureV3 и PP-ChatOCRv4 обновлен до этой модели.
|
||||
- Оптимизирована модель классификации ориентации строк текста `PP-LCNet_x0_25_textline_ori`, точность улучшена на 3.3 процентных пункта до текущего значения 98.85%.
|
||||
|
||||
- **Оптимизации и исправления некоторых проблем версии 3.0.0, [подробности](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: Официальный релиз **PaddleOCR v3.0**, включающий:
|
||||
- **PP-OCRv5**: Высокоточная модель распознавания текста для всех сценариев - Мгновенное извлечение текста из изображений/PDF.
|
||||
1. 🌐 Поддержка **пяти** типов текста в одной модели - Бесшовная обработка **упрощенного китайского, традиционного китайского, пиньиня, английского** и **японского** в рамках одной модели.
|
||||
2. ✍️ Улучшенное **распознавание рукописного текста**: Значительно лучше справляется со сложными слитными и нестандартными почерками.
|
||||
3. 🎯 **Прирост точности на 13 процентных пунктов** по сравнению с PP-OCRv4, достижение самых современных результатов в различных реальных сценариях.
|
||||
|
||||
- **PP-StructureV3**: Универсальный парсинг документов – Используйте SOTA парсинг изображений/PDF для реальных сценариев!
|
||||
1. 🧮 **Высокоточный парсинг PDF в различных сценариях**, опережающий как открытые, так и закрытые решения на бенчмарке OmniDocBench.
|
||||
2. 🧠 Специализированные возможности включают **распознавание печатей**, **преобразование диаграмм в таблицы**, **распознавание таблиц с вложенными формулами/изображениями**, **парсинг документов с вертикальным текстом** и **анализ сложных структур таблиц**.
|
||||
|
||||
- **PP-ChatOCRv4**: Интеллектуальное понимание документов – Извлекайте ключевую информацию, а не просто текст из изображений/PDF.
|
||||
1. 🔥 **Прирост точности на 15 процентных пунктов** в извлечении ключевой информации из файлов PDF/PNG/JPG по сравнению с предыдущим поколением.
|
||||
2. 💻 Нативная поддержка **ERNIE 4.5 Turbo**, с совместимостью для развертывания больших моделей через PaddleNLP, Ollama, vLLM и другие.
|
||||
3. 🤝 Интегрирован [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2), обеспечивающий извлечение и понимание печатного текста, рукописного текста, печатей, таблиц, диаграмм и других общих элементов в сложных документах.
|
||||
|
||||
[История обновлений](https://paddlepaddle.github.io/PaddleOCR/latest/en/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ Быстрый старт
|
||||
### 1. Запустить онлайн-демо
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. Установка
|
||||
|
||||
Установите PaddlePaddle, следуя [Руководству по установке](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html), после чего установите инструментарий PaddleOCR.
|
||||
|
||||
```bash
|
||||
# Установить paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. Запуск инференса через CLI
|
||||
```bash
|
||||
# Запустить инференс PP-OCRv5
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# Запустить инференс PP-StructureV3
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Сначала получите Qianfan API Key, а затем запустите инференс PP-ChatOCRv4
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 驾驶室准乘人数 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# Получить больше информации о "paddleocr ocr"
|
||||
paddleocr ocr --help
|
||||
```
|
||||
|
||||
### 4. Запуск инференса через API
|
||||
**4.1 Пример для PP-OCRv5**
|
||||
```python
|
||||
# Инициализация экземпляра PaddleOCR
|
||||
from paddleocr import PaddleOCR
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# Запуск инференса OCR на примере изображения
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# Визуализация результатов и сохранение в формате JSON
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 Пример для PP-StructureV3</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# Для изображений
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# Визуализация результатов и сохранение в формате JSON
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 Пример для PP-ChatOCRv4</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # ваш api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # ваш api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# Если используется мультимодальная большая модель, необходимо запустить локальный сервис mllm. Вы можете обратиться к документации: https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.en.md для выполнения развертывания и обновления конфигурации mllm_chat_bot_config.
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # URL вашего локального сервиса mllm
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # ваш api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### 5. Китайские гетерогенные ИИ-ускорители
|
||||
- [Huawei Ascend](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
|
||||
- [KUNLUNXIN](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
|
||||
|
||||
## ⛰️ Продвинутые руководства
|
||||
- [Руководство по PP-OCRv5](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [Руководство по PP-StructureV3](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [Руководство по PP-ChatOCRv4](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 Краткий обзор результатов выполнения
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="Демо PP-OCRv5">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="Демо PP-StructureV3">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 Сообщество
|
||||
|
||||
| Официальный аккаунт PaddlePaddle в WeChat | Присоединяйтесь к группе для технических обсуждений |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
|
||||
## 😃 Потрясающие проекты, использующие PaddleOCR
|
||||
PaddleOCR не был бы там, где он есть сегодня, без своего невероятного сообщества! 💗 Огромное спасибо всем нашим давним партнерам, новым сотрудникам и всем, кто вложил свою страсть в PaddleOCR — независимо от того, назвали мы вас или нет. Ваша поддержка разжигает наш огонь!
|
||||
|
||||
| Название проекта | Описание |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|RAG-движок, основанный на глубоком понимании документов.|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|Инструмент для преобразования документов различных типов в Markdown|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Бесплатное офлайн-программное обеспечение для пакетного OCR с открытым исходным кодом.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |Инструмент парсинга экрана для GUI-агента, основанного исключительно на компьютерном зрении.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Система вопросов и ответов на основе любого контента.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|Мощный инструментарий с открытым исходным кодом, предназначенный для эффективного извлечения высококачественного контента из сложных и разнообразных PDF-документов.|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |Распознает текст на экране, переводит его и отображает результаты перевода в режиме реального времени.|
|
||||
| [Узнать больше о проектах](./awesome_projects.md) | [Больше проектов на основе PaddleOCR](./awesome_projects.md)|
|
||||
|
||||
## 👩👩👧👦 Контрибьюторы
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 Лицензия
|
||||
Этот проект выпущен под [лицензией Apache 2.0](LICENSE).
|
||||
|
||||
## 🎓 Цитирование
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
345
PaddleOCR-3.1.0/README_tcn.md
Normal file
345
PaddleOCR-3.1.0/README_tcn.md
Normal file
@ -0,0 +1,345 @@
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Banner_cn.png" alt="PaddleOCR 橫幅">
|
||||
</p>
|
||||
|
||||
<!-- language -->
|
||||
[English](./README.md) | [简体中文](./README_cn.md) | 繁體中文 | [日本語](./README_ja.md) | [한국어](./README_ko.md) | [Français](./README_fr.md) | [Русский](./README_ru.md) | [Español](./README_es.md) | [العربية](./README_ar.md)
|
||||
|
||||
<!-- icon -->
|
||||
|
||||
[](https://github.com/PaddlePaddle/PaddleOCR)
|
||||
[](https://pypi.org/project/PaddleOCR/)
|
||||

|
||||

|
||||

|
||||
|
||||
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
</div>
|
||||
|
||||
## 🚀 簡介
|
||||
PaddleOCR 自發布以來,憑藉其學術前沿的演算法與產業落地實踐,深受產學研各界的喜愛,並廣泛應用於眾多知名開源專案,如 Umi-OCR、OmniParser、MinerU、RAGFlow 等,已成為廣大開發者心中開源 OCR 領域的首選工具。2025 年 5 月 20 日,飛槳團隊發布 **PaddleOCR 3.0**,全面適配**飛槳框架 3.0 正式版**,進一步**提升文字辨識精度**,支援**多種文字類型辨識**和**手寫體辨識**,滿足大型模型應用對**複雜文件高精度解析**的旺盛需求。結合**ERNIE 4.5 Turbo**,顯著提升了關鍵資訊擷取的精度,並新增**對崑崙芯、昇騰等國產硬體**的支援。完整使用說明請參閱 [PaddleOCR 3.0 文檔](https://paddlepaddle.github.io/PaddleOCR/latest/)。
|
||||
|
||||
PaddleOCR 3.0 **新增**三大特色功能:
|
||||
- 全場景文字辨識模型 [PP-OCRv5](docs/version3.x/algorithm/PP-OCRv5/PP-OCRv5.md):單一模型支援五種文字類型和複雜手寫體辨識;整體辨識精度相較前一代**提升 13 個百分點**。[線上體驗](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
- 通用文件解析方案 [PP-StructureV3](docs/version3.x/algorithm/PP-StructureV3/PP-StructureV3.md):支援多場景、多版式的 PDF 高精度解析,在公開評測集中**領先眾多開源與閉源方案**。[線上體驗](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
- 智慧文件理解方案 [PP-ChatOCRv4](docs/version3.x/algorithm/PP-ChatOCRv4/PP-ChatOCRv4.md):原生支援ERNIE 4.5 Turbo,精度相較前一代**提升 15 個百分點**。[線上體驗](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
除了提供優秀的模型庫,PaddleOCR 3.0 還提供好學易用的工具,涵蓋模型訓練、推論及服務化部署,方便開發者快速將 AI 應用落地。
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/Arch_cn.png" alt="PaddleOCR 架構">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
|
||||
## 📣 最新動態
|
||||
|
||||
**🔥🔥2025.06.29:發布 PaddleOCR 3.1.0**,內容包括:
|
||||
|
||||
- **主要模型與流程:**
|
||||
- **新增 PP-OCRv5 多語言文字識別模型**,支援包括法語、西班牙語、葡萄牙語、俄語、韓語等在內的 37 種語言的文字識別模型訓練與推理。**平均準確率提升超過 30%。** [詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/algorithm/PP-OCRv5/PP-OCRv5_multi_languages.html)
|
||||
- 升級了 PP-StructureV3 的 **PP-Chart2Table 模型**,進一步提升圖表轉表格能力。在內部自訂評測集上,指標(RMS-F1)**提升了 9.36 個百分點(71.24% -> 80.60%)。**
|
||||
- 新增基於 PP-StructureV3 和 ERNIE 4.5 Turbo 的**文件翻譯流程 PP-DocTranslation**,支援 Markdown 格式文件、各種複雜版面 PDF 文件及文件圖片翻譯,結果可儲存為 Markdown 格式文件。[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-DocTranslation.html)
|
||||
|
||||
- **新增 MCP server:**[Details](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/mcp_server.html)
|
||||
- **支援 OCR 及 PP-StructureV3 流程。**
|
||||
- 支援三種工作模式:本地 Python 函式庫、AIStudio 社群雲端服務、自主託管服務。
|
||||
- 支援通過 stdio 調用本地服務,通過 Streamable HTTP 調用遠端服務。
|
||||
|
||||
- **文件優化:** 優化了部分使用說明文件描述,提升閱讀體驗。
|
||||
|
||||
2025.06.26: **PaddleOCR 3.0.3** 發布,包含:
|
||||
|
||||
- 錯誤修復:修復`enable_mkldnn`參數不生效的問題,恢復CPU默認使用MKL-DNN推理的行為。
|
||||
|
||||
|
||||
2025.06.19: **PaddleOCR 3.0.2** 發布,包含:
|
||||
|
||||
- **功能新增:**
|
||||
- 模型預設下載來源從`BOS`改為`HuggingFace`,同時也支援使用者透過更改環境變數`PADDLE_PDX_MODEL_SOURCE`為`BOS`,將模型下載來源設定為百度雲端物件儲存 BOS。
|
||||
- PP-OCRv5、PP-StructureV3、PP-ChatOCRv4 等 pipeline 新增 C++、Java、Go、C#、Node.js、PHP 6 種語言的服務呼叫範例。
|
||||
- 優化 PP-StructureV3 產線中版面分區排序演算法,對複雜直書版面排序邏輯進行完善,進一步提升了複雜版面排序效果。
|
||||
- 優化模型選擇邏輯,當指定語言、未指定模型版本時,自動選擇支援該語言的最新版本的模型。
|
||||
- 為 MKL-DNN 快取大小設定預設上限,防止快取無限增長。同時,支援使用者設定快取容量。
|
||||
- 更新高效能推論預設設定,支援 Paddle MKL-DNN 加速。優化高效能推論自動設定邏輯,支援更智慧的設定選擇。
|
||||
- 調整預設裝置取得邏輯,考量環境中安裝的 Paddle 框架對運算裝置的實際支援情況,使程式行為更符合直覺。
|
||||
- 新增 PP-OCRv5 的 Android 端範例,[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/deployment/on_device_deployment.html)。
|
||||
|
||||
- **錯誤修復:**
|
||||
- 修復 PP-StructureV3 部分 CLI 參數不生效的問題。
|
||||
- 修復部分情況下 `export_paddlex_config_to_yaml` 無法正常運作的問題。
|
||||
- 修復 save_path 實際行為與文件描述不符的問題。
|
||||
- 修復基礎服務化部署在使用 MKL-DNN 時可能出現的多執行緒錯誤。
|
||||
- 修復 Latex-OCR 模型的影像預處理通道順序錯誤。
|
||||
- 修復文字辨識模組儲存視覺化影像的通道順序錯誤。
|
||||
- 修復 PP-StructureV3 中表格視覺化結果通道順序錯誤。
|
||||
- 修復 PP-StructureV3 產線中極特殊情況下,計算 overlap_ratio 時,變數溢位問題。
|
||||
|
||||
- **文件優化:**
|
||||
- 更新文件中對 `enable_mkldnn` 參數的說明,使其更準確地描述程式的實際行為。
|
||||
- 修復文件中對 `lang` 和 `ocr_version` 參數描述的錯誤。
|
||||
- 補充透過 CLI 匯出產線設定檔案的說明。
|
||||
- 修復 PP-OCRv5 效能資料表格中的欄位缺失問題。
|
||||
- 潤飾 PP-StructureV3 在不同設定下的 benchmark 指標。
|
||||
|
||||
- **其他:**
|
||||
- 放寬 numpy、pandas 等依賴項的版本限制,恢復對 Python 3.12 的支援。
|
||||
|
||||
<details>
|
||||
<summary><strong>歷史日誌</strong></summary>
|
||||
|
||||
🔥🔥2025.06.05: **PaddleOCR 3.0.1** 發布,包含:
|
||||
|
||||
- **優化部分模型和模型設定:**
|
||||
- 更新 PP-OCRv5 預設模型設定,偵測和辨識模型均由 mobile 改為 server 模型。為改善多數場景下的預設效果,設定中的參數 `limit_side_len` 由 736 改為 64。
|
||||
- 新增文字行方向分類模型 `PP-LCNet_x1_0_textline_ori`,精度達 99.42%。OCR、PP-StructureV3、PP-ChatOCRv4 流程的預設文字行方向分類器已更新為此模型。
|
||||
- 優化文字行方向分類模型 `PP-LCNet_x0_25_textline_ori`,精度提升 3.3 個百分點,目前精度為 98.85%。
|
||||
- **優化及修復 3.0.0 版本的部分問題,[詳情](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)**
|
||||
|
||||
🔥🔥2025.05.20: **PaddleOCR 3.0** 正式發布,包含:
|
||||
- **PP-OCRv5**: 全場景高精度文字辨識
|
||||
|
||||
1. 🌐 單一模型支援**五種**文字類型(**簡體中文**、**繁體中文**、**中文拼音**、**英文**和**日文**)。
|
||||
2. ✍️ 支援複雜**手寫體**辨識:顯著提升對複雜連筆、非標準字跡的辨識效能。
|
||||
3. 🎯 整體辨識精度提升:在多種應用場景達到 SOTA 精度,相較於上一版 PP-OCRv4,辨識精度**提升 13 個百分點**!
|
||||
|
||||
- **PP-StructureV3**: 通用文件解析方案
|
||||
|
||||
1. 🧮 支援多場景 PDF 高精度解析,在 OmniDocBench 基準測試中**領先眾多開源與閉源方案**。
|
||||
2. 🧠 多項專業功能:**印章辨識**、**圖表轉表格**、**含嵌套公式/圖片的表格辨識**、**直書文字解析**及**複雜表格結構分析**等。
|
||||
|
||||
|
||||
- **PP-ChatOCRv4**: 智慧文件理解方案
|
||||
1. 🔥 文件影像(PDF/PNG/JPG)關鍵資訊擷取精度相較前一代**提升 15 個百分點**!
|
||||
2. 💻 原生支援**ERNIE 4.5 Turbo**,並相容 PaddleNLP、Ollama、vLLM 等工具部署的大型模型。
|
||||
3. 🤝 整合 [PP-DocBee2](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/ppdocbee2),支援印刷體、手寫體、印章、表格、圖表等複雜文件元素的資訊擷取與理解。
|
||||
|
||||
[更多日誌](https://paddlepaddle.github.io/PaddleOCR/latest/update/update.html)
|
||||
|
||||
</details>
|
||||
|
||||
## ⚡ 快速入門
|
||||
### 1. 線上體驗
|
||||
[](https://aistudio.baidu.com/community/app/91660/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518494/webUI)
|
||||
[](https://aistudio.baidu.com/community/app/518493/webUI)
|
||||
|
||||
### 2. 本機安裝
|
||||
|
||||
請參考[安裝指南](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/pip/linux-pip.html)完成 **PaddlePaddle 3.0** 的安裝,然後安裝 paddleocr。
|
||||
|
||||
```bash
|
||||
# 安裝 paddleocr
|
||||
pip install paddleocr
|
||||
```
|
||||
|
||||
### 3. 命令列推論
|
||||
```bash
|
||||
# 執行 PP-OCRv5 推論
|
||||
paddleocr ocr -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png --use_doc_orientation_classify False --use_doc_unwarping False --use_textline_orientation False
|
||||
|
||||
# 執行 PP-StructureV3 推論
|
||||
paddleocr pp_structurev3 -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 執行 PP-ChatOCRv4 推論前,需先取得千帆 API Key
|
||||
paddleocr pp_chatocrv4_doc -i https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png -k 駕駛室准乘人數 --qianfan_api_key your_api_key --use_doc_orientation_classify False --use_doc_unwarping False
|
||||
|
||||
# 查看 "paddleocr ocr" 詳細參數
|
||||
paddleocr ocr --help
|
||||
```
|
||||
### 4. API 推論
|
||||
|
||||
**4.1 PP-OCRv5 範例**
|
||||
```python
|
||||
from paddleocr import PaddleOCR
|
||||
# 初始化 PaddleOCR 執行個體
|
||||
ocr = PaddleOCR(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False,
|
||||
use_textline_orientation=False)
|
||||
|
||||
# 對範例圖片執行 OCR 推論
|
||||
result = ocr.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_002.png")
|
||||
|
||||
# 將結果視覺化並儲存為 JSON
|
||||
for res in result:
|
||||
res.print()
|
||||
res.save_to_img("output")
|
||||
res.save_to_json("output")
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary><strong>4.2 PP-StructureV3 範例</strong></summary>
|
||||
|
||||
```python
|
||||
from pathlib import Path
|
||||
from paddleocr import PPStructureV3
|
||||
|
||||
pipeline = PPStructureV3(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
# 針對圖片
|
||||
output = pipeline.predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/pp_structure_v3_demo.png",
|
||||
)
|
||||
|
||||
# 將結果視覺化並儲存為 JSON
|
||||
for res in output:
|
||||
res.print()
|
||||
res.save_to_json(save_path="output")
|
||||
res.save_to_markdown(save_path="output")
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
<details>
|
||||
<summary><strong>4.3 PP-ChatOCRv4 範例</strong></summary>
|
||||
|
||||
```python
|
||||
from paddleocr import PPChatOCRv4Doc
|
||||
|
||||
chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "ernie-3.5-8k",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
retriever_config = {
|
||||
"module_name": "retriever",
|
||||
"model_name": "embedding-v1",
|
||||
"base_url": "https://qianfan.baidubce.com/v2",
|
||||
"api_type": "qianfan",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
pipeline = PPChatOCRv4Doc(
|
||||
use_doc_orientation_classify=False,
|
||||
use_doc_unwarping=False
|
||||
)
|
||||
|
||||
visual_predict_res = pipeline.visual_predict(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
use_common_ocr=True,
|
||||
use_seal_recognition=True,
|
||||
use_table_recognition=True,
|
||||
)
|
||||
|
||||
mllm_predict_info = None
|
||||
use_mllm = False
|
||||
# 若使用多模態大型模型,需啟動本機 mllm 服務,可參考文件:https://github.com/PaddlePaddle/PaddleX/blob/release/3.0/docs/pipeline_usage/tutorials/vlm_pipelines/doc_understanding.md 進行部署,並更新 mllm_chat_bot_config 設定。
|
||||
if use_mllm:
|
||||
mllm_chat_bot_config = {
|
||||
"module_name": "chat_bot",
|
||||
"model_name": "PP-DocBee",
|
||||
"base_url": "http://127.0.0.1:8080/", # your local mllm service url
|
||||
"api_type": "openai",
|
||||
"api_key": "api_key", # your api_key
|
||||
}
|
||||
|
||||
mllm_predict_res = pipeline.mllm_pred(
|
||||
input="https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/vehicle_certificate-1.png",
|
||||
key_list=["驾驶室准乘人数"],
|
||||
mllm_chat_bot_config=mllm_chat_bot_config,
|
||||
)
|
||||
mllm_predict_info = mllm_predict_res["mllm_res"]
|
||||
|
||||
visual_info_list = []
|
||||
for res in visual_predict_res:
|
||||
visual_info_list.append(res["visual_info"])
|
||||
layout_parsing_result = res["layout_parsing_result"]
|
||||
|
||||
vector_info = pipeline.build_vector(
|
||||
visual_info_list, flag_save_bytes_vector=True, retriever_config=retriever_config
|
||||
)
|
||||
chat_result = pipeline.chat(
|
||||
key_list=["驾驶室准乘人数"],
|
||||
visual_info=visual_info_list,
|
||||
vector_info=vector_info,
|
||||
mllm_predict_info=mllm_predict_info,
|
||||
chat_bot_config=chat_bot_config,
|
||||
retriever_config=retriever_config,
|
||||
)
|
||||
print(chat_result)
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
|
||||
### 5. **國產硬體支援**
|
||||
- [崑崙芯安裝指南](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_XPU.html)
|
||||
- [昇騰安裝指南](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/other_devices_support/paddlepaddle_install_NPU.html)
|
||||
|
||||
## ⛰️ 進階指南
|
||||
- [PP-OCRv5 使用教學](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/OCR.html)
|
||||
- [PP-StructureV3 使用教學](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-StructureV3.html)
|
||||
- [PP-ChatOCRv4 使用教學](https://paddlepaddle.github.io/PaddleOCR/latest/version3.x/pipeline_usage/PP-ChatOCRv4.html)
|
||||
|
||||
## 🔄 效果展示
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/demo.gif" alt="PP-OCRv5 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div align="center">
|
||||
<p>
|
||||
<img width="100%" src="./docs/images/blue_v3.gif" alt="PP-StructureV3 Demo">
|
||||
</p>
|
||||
</div>
|
||||
|
||||
## 👩👩👧👦 開發者社群
|
||||
|
||||
| 掃描 QR Code 關注飛槳官方帳號 | 掃描 QR Code 加入技術交流群組 |
|
||||
| :---: | :---: |
|
||||
| <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qrcode_for_paddlepaddle_official_account.jpg" width="150"> | <img src="https://raw.githubusercontent.com/cuicheng01/PaddleX_doc_images/refs/heads/main/images/paddleocr/README/qr_code_for_the_questionnaire.jpg" width="150"> |
|
||||
|
||||
## 🏆 採用 PaddleOCR 的優秀專案
|
||||
PaddleOCR 的發展離不開社群的貢獻!💗 衷心感謝所有的開發者、合作夥伴與貢獻者!
|
||||
| 專案名稱 | 簡介 |
|
||||
| ------------ | ----------- |
|
||||
| [RAGFlow](https://github.com/infiniflow/ragflow) <a href="https://github.com/infiniflow/ragflow"><img src="https://img.shields.io/github/stars/infiniflow/ragflow"></a>|基於 RAG 的 AI 工作流引擎|
|
||||
| [MinerU](https://github.com/opendatalab/MinerU) <a href="https://github.com/opendatalab/MinerU"><img src="https://img.shields.io/github/stars/opendatalab/MinerU"></a>|多類型文件轉 Markdown 工具|
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|開源批次離線 OCR 軟體|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |基於純視覺的 GUI Agent 螢幕解析工具|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |基於任意內容的問答系統|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a>|高效複雜 PDF 文件擷取工具套件|
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator)<a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> |螢幕即時翻譯工具|
|
||||
| [更多專案](./awesome_projects.md) | |
|
||||
|
||||
## 👩👩👧👦 貢獻者
|
||||
|
||||
<a href="https://github.com/PaddlePaddle/PaddleOCR/graphs/contributors">
|
||||
<img src="https://contrib.rocks/image?repo=PaddlePaddle/PaddleOCR&max=400&columns=20" width="800"/>
|
||||
</a>
|
||||
|
||||
|
||||
## 🌟 Star
|
||||
|
||||
[](https://star-history.com/#PaddlePaddle/PaddleOCR&Date)
|
||||
|
||||
|
||||
## 📄 授權條款
|
||||
本專案的發布受 [Apache 2.0 license](LICENSE) 授權條款認證。
|
||||
|
||||
## 🎓 學術引用
|
||||
|
||||
```
|
||||
@misc{paddleocr2020,
|
||||
title={PaddleOCR, Awesome multilingual OCR toolkits based on PaddlePaddle.},
|
||||
author={PaddlePaddle Authors},
|
||||
howpublished = {\url{https://github.com/PaddlePaddle/PaddleOCR}},
|
||||
year={2020}
|
||||
}
|
||||
```
|
||||
1
PaddleOCR-3.1.0/applications/README.md
Normal file
1
PaddleOCR-3.1.0/applications/README.md
Normal file
@ -0,0 +1 @@
|
||||
移步[docs](https://paddlepaddle.github.io/PaddleOCR/latest/applications/overview.html)
|
||||
29
PaddleOCR-3.1.0/awesome_projects.md
Normal file
29
PaddleOCR-3.1.0/awesome_projects.md
Normal file
@ -0,0 +1,29 @@
|
||||
## 😃 Awesome projects based on PaddleOCR
|
||||
💗 PaddleOCR wouldn’t be where it is today without its incredible community! A massive 🙌 thank you 🙌 to all our longtime partners, new collaborators, and everyone who’s poured their passion into PaddleOCR — whether we’ve named you or not. Your support fuels our fire! 🔥
|
||||
| Project Name | Description |
|
||||
| ------------ | ----------- |
|
||||
| [Umi-OCR](https://github.com/hiroi-sora/Umi-OCR) <a href="https://github.com/hiroi-sora/Umi-OCR"><img src="https://img.shields.io/github/stars/hiroi-sora/Umi-OCR"></a>|Free, Open-source, Batch Offline OCR Software.|
|
||||
| [LearnOpenCV](http://github.com/spmallick/learnopencv) <a href="http://github.com/spmallick/learnopencv"><img src="https://img.shields.io/github/stars/spmallick/learnopencv"></a> | code for Computer Vision, Deep learning, and AI research articles.|
|
||||
| [OmniParser](https://github.com/microsoft/OmniParser)<a href="https://github.com/microsoft/OmniParser"><img src="https://img.shields.io/github/stars/microsoft/OmniParser"></a> |OmniParser: Screen Parsing tool for Pure Vision Based GUI Agent.|
|
||||
| [QAnything](https://github.com/netease-youdao/QAnything)<a href="https://github.com/netease-youdao/QAnything"><img src="https://img.shields.io/github/stars/netease-youdao/QAnything"></a> |Question and Answer based on Anything.|
|
||||
| [PaddleHub](https://github.com/PaddlePaddle/PaddleHub)<a href="https://github.com/PaddlePaddle/PaddleHub"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleHub"></a> |400+ AI Models: Rich, high-quality AI models, including CV, NLP, Speech, Video and Cross-Modal.|
|
||||
| [PaddleNLP](https://github.com/PaddlePaddle/PaddleNLP)<a href="https://github.com/PaddlePaddle/PaddleNLP"><img src="https://img.shields.io/github/stars/PaddlePaddle/PaddleNLP"></a> |A Large Language Model (LLM) development suite based on the PaddlePaddle.|
|
||||
| [Rerun](https://github.com/rerun-io/rerun) <a href="https://github.com/rerun-io/rerun"><img src="https://img.shields.io/github/stars/rerun-io/rerun"></a> | Rerun is building the multimodal data stack to model, ingest, store, query and view robotics-style data |
|
||||
| [Dango-Translator](https://github.com/PantsuDango/Dango-Translator) <a href="https://github.com/PantsuDango/Dango-Translator"><img src="https://img.shields.io/github/stars/PantsuDango/Dango-Translator"></a> | Recognize text on the screen, translate it and show the translation results in real time.|
|
||||
| [PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit) <a href="https://github.com/opendatalab/PDF-Extract-Kit"><img src="https://img.shields.io/github/stars/opendatalab/PDF-Extract-Kit"></a> | PDF-Extract-Kit is a powerful open-source toolkit designed to efficiently extract high-quality content from complex and diverse PDF documents. |
|
||||
| [manga-image-translator](https://github.com/zyddnys/manga-image-translator) <a href="https://github.com/zyddnys/manga-image-translator"><img src="https://img.shields.io/github/stars/zyddnys/manga-image-translator"></a> | Translate texts in manga/images.|
|
||||
| [March7thAssistant](https://github.com/moesnow/March7thAssistant) <a href="https://github.com/moesnow/March7thAssistant"><img src="https://img.shields.io/github/stars/moesnow/March7thAssistant"></a> | Daily Tasks: Stamina recovery, daily training, claiming rewards, commissions, and farming. |
|
||||
| [PaddlePaddle/models](https://github.com/PaddlePaddle/models) <a href="https://github.com/PaddlePaddle/models"><img src="https://img.shields.io/github/stars/PaddlePaddle/models"></a> |PaddlePaddle's industrial-grade model zoo.|
|
||||
| [katanaml/sparrow](https://github.com/katanaml/sparrow) <a href="https://github.com/katanaml/sparrow"><img src="https://img.shields.io/github/stars/katanaml/sparrow"></a> | Sparrow is an innovative open-source solution for efficient data extraction and processing from various documents and images. |
|
||||
| [RapidOCR](https://github.com/RapidAI/RapidOCR) <a href="https://github.com/RapidAI/RapidOCR"><img src="https://img.shields.io/github/stars/RapidAI/RapidOCR"></a> | Awesome OCR multiple programing languages toolkits based on ONNXRuntime, OpenVINO, PaddlePaddle and PyTorch |
|
||||
| [autoMate](https://github.com/yuruotong1/autoMate) <a href="https://github.com/yuruotong1/autoMate"><img src="https://img.shields.io/github/stars/yuruotong1/autoMate"></a> | AI-Powered Local Automation Tool & Let Your Computer Work for You. |
|
||||
| [Agent-S](https://github.com/simular-ai/Agent-S) <a href="https://github.com/simular-ai/Agent-S"><img src="https://img.shields.io/github/stars/simular-ai/Agent-S"></a> | A Compositional Generalist-Specialist Framework for Computer Use Agents. |
|
||||
| [pdf-craft](https://github.com/oomol-lab/pdf-craft) <a href="https://github.com/oomol-lab/pdf-craft"><img src="https://img.shields.io/github/stars/oomol-lab/pdf-craft"></a> | PDF Craft can convert PDF files into various other formats. |
|
||||
| [VV](https://github.com/Cicada000/VV) <a href="https://github.com/Cicada000/VV"><img src="https://img.shields.io/github/stars/Cicada000/VV"></a> | Zhang Weiwei Quotations Search Project. |
|
||||
| [docetl](https://github.com/ucbepic/docetl) <a href="https://github.com/ucbepic/docetl"><img src="https://img.shields.io/github/stars/ucbepic/docetl"></a> | DocETL is a tool for creating and executing data processing pipelines, especially suited for complex document processing tasks. |
|
||||
| [ZenlessZoneZero-Auto](https://github.com/sMythicalBird/ZenlessZoneZero-Auto) <a href="https://github.com/sMythicalBird/ZenlessZoneZero-Auto"><img src="https://img.shields.io/github/stars/sMythicalBird/ZenlessZoneZero-Auto"></a> | Zenless Zone Zero Automation Framework. |
|
||||
| [Yuxi-Know](https://github.com/xerrors/Yuxi-Know) <a href="https://github.com/xerrors/Yuxi-Know"><img src="https://img.shields.io/github/stars/xerrors/Yuxi-Know"></a> | Knowledge graph question answering system based on LLMs. |
|
||||
| [PaddleSharp](https://github.com/sdcb/PaddleSharp) <a href="https://github.com/sdcb/PaddleSharp"><img src="https://img.shields.io/github/stars/sdcb/PaddleSharp"></a>|.NET/C# binding for Baidu paddle inference library and PaddleOCR |
|
||||
| [python-office](https://github.com/CoderWanFeng/python-office) <a href="https://github.com/CoderWanFeng/python-office"><img src="https://img.shields.io/github/stars/CoderWanFeng/python-office"></a> | Python tool for office works. |
|
||||
| [OnnxOCR](https://github.com/jingsongliujing/OnnxOCR) <a href="https://github.com/jingsongliujing/OnnxOCR"><img src="https://img.shields.io/github/stars/jingsongliujing/OnnxOCR"></a>|A lightweight OCR system based on PaddleOCR, decoupled from the PaddlePaddle deep learning training framework, with ultra-fast inference speed |
|
||||
| ... |... |
|
||||
2
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/.gitattributes
vendored
Normal file
2
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/.gitattributes
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
*.html linguist-language=python
|
||||
*.ipynb linguist-language=python
|
||||
16
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/.gitignore
vendored
Normal file
16
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/.gitignore
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
.DS_Store
|
||||
*.pth
|
||||
*.pyc
|
||||
*.pyo
|
||||
*.log
|
||||
*.tmp
|
||||
*.pkl
|
||||
__pycache__/
|
||||
.idea/
|
||||
output/
|
||||
test/*.jpg
|
||||
datasets/
|
||||
index/
|
||||
train_log/
|
||||
log/
|
||||
profiling_log/
|
||||
201
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/LICENSE.md
Normal file
201
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/LICENSE.md
Normal file
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
132
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/README.MD
Normal file
132
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/README.MD
Normal file
@ -0,0 +1,132 @@
|
||||
# Real-time Scene Text Detection with Differentiable Binarization
|
||||
|
||||
**note**: some code is inherited from [WenmuZhou/DBNet.pytorch](https://github.com/WenmuZhou/DBNet.pytorch)
|
||||
|
||||
[中文解读](https://zhuanlan.zhihu.com/p/94677957)
|
||||
|
||||

|
||||
|
||||
## update
|
||||
2020-06-07: 添加灰度图训练,训练灰度图时需要在配置里移除`dataset.args.transforms.Normalize`
|
||||
|
||||
## Install Using Conda
|
||||
```
|
||||
conda env create -f environment.yml
|
||||
git clone https://github.com/WenmuZhou/DBNet.paddle.git
|
||||
cd DBNet.paddle/
|
||||
```
|
||||
|
||||
or
|
||||
## Install Manually
|
||||
```bash
|
||||
conda create -n dbnet python=3.6
|
||||
conda activate dbnet
|
||||
|
||||
conda install ipython pip
|
||||
|
||||
# python dependencies
|
||||
pip install -r requirement.txt
|
||||
|
||||
# clone repo
|
||||
git clone https://github.com/WenmuZhou/DBNet.paddle.git
|
||||
cd DBNet.paddle/
|
||||
|
||||
```
|
||||
|
||||
## Requirements
|
||||
* paddlepaddle 2.4+
|
||||
|
||||
## Download
|
||||
|
||||
TBD
|
||||
|
||||
## Data Preparation
|
||||
|
||||
Training data: prepare a text `train.txt` in the following format, use '\t' as a separator
|
||||
```
|
||||
./datasets/train/img/001.jpg ./datasets/train/gt/001.txt
|
||||
```
|
||||
|
||||
Validation data: prepare a text `test.txt` in the following format, use '\t' as a separator
|
||||
```
|
||||
./datasets/test/img/001.jpg ./datasets/test/gt/001.txt
|
||||
```
|
||||
- Store images in the `img` folder
|
||||
- Store groundtruth in the `gt` folder
|
||||
|
||||
The groundtruth can be `.txt` files, with the following format:
|
||||
```
|
||||
x1, y1, x2, y2, x3, y3, x4, y4, annotation
|
||||
```
|
||||
|
||||
|
||||
## Train
|
||||
1. config the `dataset['train']['dataset']['data_path']`, `dataset['validate']['dataset']['data_path']` in [config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml](config/icdar2015_resnet18_fpn_DBhead_polyLR.yaml)
|
||||
* . single gpu train
|
||||
```bash
|
||||
bash single_gpu_train.sh
|
||||
```
|
||||
* . Multi-gpu training
|
||||
```bash
|
||||
bash multi_gpu_train.sh
|
||||
```
|
||||
## Test
|
||||
|
||||
[eval.py](tools/eval.py) is used to test model on test dataset
|
||||
|
||||
1. config `model_path` in [eval.sh](eval.sh)
|
||||
2. use following script to test
|
||||
```bash
|
||||
bash eval.sh
|
||||
```
|
||||
|
||||
## Predict
|
||||
[predict.py](tools/predict.py) can be used to run inference on all images in a folder
|
||||
1. config `model_path`,`input_folder`,`output_folder` in [predict.sh](predict.sh)
|
||||
2. use following script to predict
|
||||
```
|
||||
bash predict.sh
|
||||
```
|
||||
You can change the `model_path` in the `predict.sh` file to your model location.
|
||||
|
||||
tips: if result is not good, you can change `thre` in [predict.sh](predict.sh)
|
||||
|
||||
## Export Model
|
||||
|
||||
[export_model.py](tools/export_model.py) can be used to export the trained model to an inference model
|
||||
|
||||
use following script to export inference model
|
||||
```
|
||||
python tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.resume_checkpoint=model_best.pth trainer.output_dir=output/infer
|
||||
```
|
||||
|
||||
## Paddle Inference infer
|
||||
|
||||
[infer.py](tools/infer.py) can be used to run inference on all images in a folder
|
||||
|
||||
use the following script to run inference with the exported model
|
||||
```
|
||||
python tools/infer.py --model-dir=output/infer/ --img-path imgs/paper/db.jpg
|
||||
```
|
||||
|
||||
<h2 id="Performance">Performance</h2>
|
||||
|
||||
### [ICDAR 2015](http://rrc.cvc.uab.es/?ch=4)
|
||||
only train on ICDAR2015 dataset
|
||||
|
||||
| Method | image size (short size) |learning rate | Precision (%) | Recall (%) | F-measure (%) | FPS |
|
||||
|:--------------------------:|:-------:|:--------:|:--------:|:------------:|:---------------:|:-----:|
|
||||
| ImageNet-resnet50-FPN-DBHead(torch) |736 |1e-3|90.19 | 78.14 | 83.88 | 27 |
|
||||
| ImageNet-resnet50-FPN-DBHead(paddle) |736 |1e-3| 89.47 | 79.03 | 83.92 | 27 |
|
||||
| ImageNet-resnet50-FPN-DBHead(paddle_amp) |736 |1e-3| 88.62 | 79.95 | 84.06 | 27 |
|
||||
|
||||
|
||||
### examples
|
||||
TBD
|
||||
|
||||
|
||||
### reference
|
||||
1. https://arxiv.org/pdf/1911.08947.pdf
|
||||
2. https://github.com/WenmuZhou/DBNet.pytorch
|
||||
|
||||
**If this repository helps you, please star it. Thanks.**
|
||||
@ -0,0 +1,2 @@
|
||||
from .base_trainer import BaseTrainer
|
||||
from .base_dataset import BaseDataSet
|
||||
@ -0,0 +1,86 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/4 13:12
|
||||
# @Author : zhoujun
|
||||
import copy
|
||||
from paddle.io import Dataset
|
||||
from data_loader.modules import *
|
||||
|
||||
|
||||
class BaseDataSet(Dataset):
    """Abstract detection-dataset base class.

    Loads an annotation list via ``load_data`` (must be implemented by a
    subclass) and applies a configurable chain of pre-processing
    augmentations to each sample before returning it.

    ``load_data`` must return a list of dicts, each containing at least the
    keys ``img_path``, ``img_name``, ``text_polys``, ``texts`` and
    ``ignore_tags``.
    """

    def __init__(
        self,
        data_path: str,
        img_mode,
        pre_processes,
        filter_keys,
        ignore_tags,
        transform=None,
        target_transform=None,
    ):
        # Only these colour modes are supported downstream ("BRG" is kept
        # as-is for backward compatibility with existing configs —
        # NOTE(review): looks like a typo for "BGR"; confirm before renaming).
        assert img_mode in ["RGB", "BRG", "GRAY"]
        self.ignore_tags = ignore_tags
        self.data_list = self.load_data(data_path)
        item_keys = ["img_path", "img_name", "text_polys", "texts", "ignore_tags"]
        for item in item_keys:
            # Fix: report the specific missing key (the original formatted the
            # whole key list, which made the assertion message useless).
            assert (
                item in self.data_list[0]
            ), "data_list from load_data must contain {}".format(item)
        self.img_mode = img_mode
        self.filter_keys = filter_keys
        self.transform = transform
        self.target_transform = target_transform
        self._init_pre_processes(pre_processes)

    def _init_pre_processes(self, pre_processes):
        """Instantiate the augmentation pipeline described by the config.

        Each entry is a dict with a ``type`` (class name) and optional
        ``args`` (dict of kwargs, or a single positional argument).
        """
        self.aug = []
        if pre_processes is not None:
            for aug in pre_processes:
                args = aug.get("args", {})
                # SECURITY: eval() on a config-supplied class name executes
                # arbitrary code if the config is untrusted; consider a
                # whitelist/registry lookup instead.
                if isinstance(args, dict):
                    cls = eval(aug["type"])(**args)
                else:
                    cls = eval(aug["type"])(args)
                self.aug.append(cls)

    def load_data(self, data_path: str) -> list:
        """Load annotations into a list of sample dicts.

        :param data_path: file or directory holding the annotations.
        :return: list of dicts with keys 'img_path', 'img_name',
            'text_polys', 'texts', 'ignore_tags'.
        """
        raise NotImplementedError

    def apply_pre_processes(self, data):
        """Run the sample dict through every configured augmentation in order."""
        for aug in self.aug:
            data = aug(data)
        return data

    def __getitem__(self, index):
        """Read, decode and augment the sample at ``index``.

        On any processing error a random other sample is returned instead
        (best-effort behaviour preserved from the original implementation).
        """
        try:
            data = copy.deepcopy(self.data_list[index])
            # flag=1 → colour (BGR), flag=0 → grayscale.
            im = cv2.imread(data["img_path"], 1 if self.img_mode != "GRAY" else 0)
            if self.img_mode == "RGB":
                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            data["img"] = im
            data["shape"] = [im.shape[0], im.shape[1]]
            data = self.apply_pre_processes(data)

            if self.transform:
                data["img"] = self.transform(data["img"])
            data["text_polys"] = data["text_polys"].tolist()
            if len(self.filter_keys):
                # Drop keys the training loop does not need.
                data_dict = {}
                for k, v in data.items():
                    if k not in self.filter_keys:
                        data_dict[k] = v
                return data_dict
            else:
                return data
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Fall back to a random sample so a
            # single corrupt image does not abort training; note this can
            # still recurse if *all* samples fail.
            return self.__getitem__(np.random.randint(self.__len__()))

    def __len__(self):
        """Number of annotated samples."""
        return len(self.data_list)
|
||||
269
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/base/base_trainer.py
Normal file
269
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/base/base_trainer.py
Normal file
@ -0,0 +1,269 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:50
|
||||
# @Author : zhoujun
|
||||
|
||||
import os
|
||||
import pathlib
|
||||
import shutil
|
||||
from pprint import pformat
|
||||
|
||||
import anyconfig
|
||||
import paddle
|
||||
import numpy as np
|
||||
import random
|
||||
from paddle.jit import to_static
|
||||
from paddle.static import InputSpec
|
||||
|
||||
from utils import setup_logger
|
||||
|
||||
|
||||
class BaseTrainer:
|
||||
def __init__(
|
||||
self,
|
||||
config,
|
||||
model,
|
||||
criterion,
|
||||
train_loader,
|
||||
validate_loader,
|
||||
metric_cls,
|
||||
post_process=None,
|
||||
):
|
||||
config["trainer"]["output_dir"] = os.path.join(
|
||||
str(pathlib.Path(os.path.abspath(__name__)).parent),
|
||||
config["trainer"]["output_dir"],
|
||||
)
|
||||
config["name"] = config["name"] + "_" + model.name
|
||||
self.save_dir = config["trainer"]["output_dir"]
|
||||
self.checkpoint_dir = os.path.join(self.save_dir, "checkpoint")
|
||||
|
||||
os.makedirs(self.checkpoint_dir, exist_ok=True)
|
||||
|
||||
self.global_step = 0
|
||||
self.start_epoch = 0
|
||||
self.config = config
|
||||
self.criterion = criterion
|
||||
# logger and tensorboard
|
||||
self.visualdl_enable = self.config["trainer"].get("visual_dl", False)
|
||||
self.epochs = self.config["trainer"]["epochs"]
|
||||
self.log_iter = self.config["trainer"]["log_iter"]
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
anyconfig.dump(config, os.path.join(self.save_dir, "config.yaml"))
|
||||
self.logger = setup_logger(os.path.join(self.save_dir, "train.log"))
|
||||
self.logger_info(pformat(self.config))
|
||||
|
||||
self.model = self.apply_to_static(model)
|
||||
|
||||
# device
|
||||
if (
|
||||
paddle.device.cuda.device_count() > 0
|
||||
and paddle.device.is_compiled_with_cuda()
|
||||
):
|
||||
self.with_cuda = True
|
||||
random.seed(self.config["trainer"]["seed"])
|
||||
np.random.seed(self.config["trainer"]["seed"])
|
||||
paddle.seed(self.config["trainer"]["seed"])
|
||||
else:
|
||||
self.with_cuda = False
|
||||
self.logger_info("train with and paddle {}".format(paddle.__version__))
|
||||
# metrics
|
||||
self.metrics = {
|
||||
"recall": 0,
|
||||
"precision": 0,
|
||||
"hmean": 0,
|
||||
"train_loss": float("inf"),
|
||||
"best_model_epoch": 0,
|
||||
}
|
||||
|
||||
self.train_loader = train_loader
|
||||
if validate_loader is not None:
|
||||
assert post_process is not None and metric_cls is not None
|
||||
self.validate_loader = validate_loader
|
||||
self.post_process = post_process
|
||||
self.metric_cls = metric_cls
|
||||
self.train_loader_len = len(train_loader)
|
||||
|
||||
if self.validate_loader is not None:
|
||||
self.logger_info(
|
||||
"train dataset has {} samples,{} in dataloader, validate dataset has {} samples,{} in dataloader".format(
|
||||
len(self.train_loader.dataset),
|
||||
self.train_loader_len,
|
||||
len(self.validate_loader.dataset),
|
||||
len(self.validate_loader),
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.logger_info(
|
||||
"train dataset has {} samples,{} in dataloader".format(
|
||||
len(self.train_loader.dataset), self.train_loader_len
|
||||
)
|
||||
)
|
||||
|
||||
self._initialize_scheduler()
|
||||
|
||||
self._initialize_optimizer()
|
||||
|
||||
# resume or finetune
|
||||
if self.config["trainer"]["resume_checkpoint"] != "":
|
||||
self._load_checkpoint(
|
||||
self.config["trainer"]["resume_checkpoint"], resume=True
|
||||
)
|
||||
elif self.config["trainer"]["finetune_checkpoint"] != "":
|
||||
self._load_checkpoint(
|
||||
self.config["trainer"]["finetune_checkpoint"], resume=False
|
||||
)
|
||||
|
||||
if self.visualdl_enable and paddle.distributed.get_rank() == 0:
|
||||
from visualdl import LogWriter
|
||||
|
||||
self.writer = LogWriter(self.save_dir)
|
||||
|
||||
# 混合精度训练
|
||||
self.amp = self.config.get("amp", None)
|
||||
if self.amp == "None":
|
||||
self.amp = None
|
||||
if self.amp:
|
||||
self.amp["scaler"] = paddle.amp.GradScaler(
|
||||
init_loss_scaling=self.amp.get("scale_loss", 1024),
|
||||
use_dynamic_loss_scaling=self.amp.get("use_dynamic_loss_scaling", True),
|
||||
)
|
||||
self.model, self.optimizer = paddle.amp.decorate(
|
||||
models=self.model,
|
||||
optimizers=self.optimizer,
|
||||
level=self.amp.get("amp_level", "O2"),
|
||||
)
|
||||
|
||||
# 分布式训练
|
||||
if paddle.device.cuda.device_count() > 1:
|
||||
self.model = paddle.DataParallel(self.model)
|
||||
# make inverse Normalize
|
||||
self.UN_Normalize = False
|
||||
for t in self.config["dataset"]["train"]["dataset"]["args"]["transforms"]:
|
||||
if t["type"] == "Normalize":
|
||||
self.normalize_mean = t["args"]["mean"]
|
||||
self.normalize_std = t["args"]["std"]
|
||||
self.UN_Normalize = True
|
||||
|
||||
def apply_to_static(self, model):
|
||||
support_to_static = self.config["trainer"].get("to_static", False)
|
||||
if support_to_static:
|
||||
specs = None
|
||||
print("static")
|
||||
specs = [InputSpec([None, 3, -1, -1])]
|
||||
model = to_static(model, input_spec=specs)
|
||||
self.logger_info(
|
||||
"Successfully to apply @to_static with specs: {}".format(specs)
|
||||
)
|
||||
return model
|
||||
|
||||
def train(self):
|
||||
"""
|
||||
Full training logic
|
||||
"""
|
||||
for epoch in range(self.start_epoch + 1, self.epochs + 1):
|
||||
self.epoch_result = self._train_epoch(epoch)
|
||||
self._on_epoch_finish()
|
||||
if paddle.distributed.get_rank() == 0 and self.visualdl_enable:
|
||||
self.writer.close()
|
||||
self._on_train_finish()
|
||||
|
||||
def _train_epoch(self, epoch):
|
||||
"""
|
||||
Training logic for an epoch
|
||||
|
||||
:param epoch: Current epoch number
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _eval(self, epoch):
|
||||
"""
|
||||
eval logic for an epoch
|
||||
|
||||
:param epoch: Current epoch number
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _on_epoch_finish(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def _on_train_finish(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def _save_checkpoint(self, epoch, file_name):
|
||||
"""
|
||||
Saving checkpoints
|
||||
|
||||
:param epoch: current epoch number
|
||||
:param log: logging information of the epoch
|
||||
:param save_best: if True, rename the saved checkpoint to 'model_best.pth.tar'
|
||||
"""
|
||||
state_dict = self.model.state_dict()
|
||||
state = {
|
||||
"epoch": epoch,
|
||||
"global_step": self.global_step,
|
||||
"state_dict": state_dict,
|
||||
"optimizer": self.optimizer.state_dict(),
|
||||
"config": self.config,
|
||||
"metrics": self.metrics,
|
||||
}
|
||||
filename = os.path.join(self.checkpoint_dir, file_name)
|
||||
paddle.save(state, filename)
|
||||
|
||||
def _load_checkpoint(self, checkpoint_path, resume):
|
||||
"""
|
||||
Resume from saved checkpoints
|
||||
:param checkpoint_path: Checkpoint path to be resumed
|
||||
"""
|
||||
self.logger_info("Loading checkpoint: {} ...".format(checkpoint_path))
|
||||
checkpoint = paddle.load(checkpoint_path)
|
||||
self.model.set_state_dict(checkpoint["state_dict"])
|
||||
if resume:
|
||||
self.global_step = checkpoint["global_step"]
|
||||
self.start_epoch = checkpoint["epoch"]
|
||||
self.config["lr_scheduler"]["args"]["last_epoch"] = self.start_epoch
|
||||
# self.scheduler.load_state_dict(checkpoint['scheduler'])
|
||||
self.optimizer.set_state_dict(checkpoint["optimizer"])
|
||||
if "metrics" in checkpoint:
|
||||
self.metrics = checkpoint["metrics"]
|
||||
self.logger_info(
|
||||
"resume from checkpoint {} (epoch {})".format(
|
||||
checkpoint_path, self.start_epoch
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.logger_info("finetune from checkpoint {}".format(checkpoint_path))
|
||||
|
||||
def _initialize(self, name, module, *args, **kwargs):
|
||||
module_name = self.config[name]["type"]
|
||||
module_args = self.config[name].get("args", {})
|
||||
assert all(
|
||||
[k not in module_args for k in kwargs]
|
||||
), "Overwriting kwargs given in config file is not allowed"
|
||||
module_args.update(kwargs)
|
||||
return getattr(module, module_name)(*args, **module_args)
|
||||
|
||||
def _initialize_scheduler(self):
|
||||
self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)
|
||||
|
||||
def _initialize_optimizer(self):
|
||||
self.optimizer = self._initialize(
|
||||
"optimizer",
|
||||
paddle.optimizer,
|
||||
parameters=self.model.parameters(),
|
||||
learning_rate=self.lr_scheduler,
|
||||
)
|
||||
|
||||
def inverse_normalize(self, batch_img):
|
||||
if self.UN_Normalize:
|
||||
batch_img[:, 0, :, :] = (
|
||||
batch_img[:, 0, :, :] * self.normalize_std[0] + self.normalize_mean[0]
|
||||
)
|
||||
batch_img[:, 1, :, :] = (
|
||||
batch_img[:, 1, :, :] * self.normalize_std[1] + self.normalize_mean[1]
|
||||
)
|
||||
batch_img[:, 2, :, :] = (
|
||||
batch_img[:, 2, :, :] * self.normalize_std[2] + self.normalize_mean[2]
|
||||
)
|
||||
|
||||
def logger_info(self, s):
|
||||
if paddle.distributed.get_rank() == 0:
|
||||
self.logger.info(s)
|
||||
@ -0,0 +1,40 @@
|
||||
name: DBNet
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
type: SynthTextDataset # 数据集类型
|
||||
args:
|
||||
data_path: ''# SynthTextDataset 根目录
|
||||
pre_processes: # 数据的预处理过程,包含augment和标签制作
|
||||
- type: IaaAugment # 使用imgaug进行变换
|
||||
args:
|
||||
- {'type':Fliplr, 'args':{'p':0.5}}
|
||||
- {'type': Affine, 'args':{'rotate':[-10,10]}}
|
||||
- {'type':Resize,'args':{'size':[0.5,3]}}
|
||||
- type: EastRandomCropData
|
||||
args:
|
||||
size: [640,640]
|
||||
max_tries: 50
|
||||
keep_ratio: true
|
||||
- type: MakeBorderMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
- type: MakeShrinkMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
min_text_size: 8
|
||||
transforms: # 对图片进行的变换方式
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
filter_keys: ['img_path','img_name','text_polys','texts','ignore_tags','shape'] # 返回数据之前,从数据字典里删除的key
|
||||
ignore_tags: ['*', '###']
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 0
|
||||
collate_fn: ''
|
||||
@ -0,0 +1,65 @@
|
||||
name: DBNet
|
||||
base: ['config/SynthText.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 10
|
||||
show_images_iter: 50
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path: ./datasets/SynthText
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
@ -0,0 +1,69 @@
|
||||
name: DBNet
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
type: ICDAR2015Dataset # 数据集类型
|
||||
args:
|
||||
data_path: # 一个存放 img_path \t gt_path的文件
|
||||
- ''
|
||||
pre_processes: # 数据的预处理过程,包含augment和标签制作
|
||||
- type: IaaAugment # 使用imgaug进行变换
|
||||
args:
|
||||
- {'type':Fliplr, 'args':{'p':0.5}}
|
||||
- {'type': Affine, 'args':{'rotate':[-10,10]}}
|
||||
- {'type':Resize,'args':{'size':[0.5,3]}}
|
||||
- type: EastRandomCropData
|
||||
args:
|
||||
size: [640,640]
|
||||
max_tries: 50
|
||||
keep_ratio: true
|
||||
- type: MakeBorderMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
thresh_min: 0.3
|
||||
thresh_max: 0.7
|
||||
- type: MakeShrinkMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
min_text_size: 8
|
||||
transforms: # 对图片进行的变换方式
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # 返回数据之前,从数据字典里删除的key
|
||||
ignore_tags: ['*', '###']
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 0
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
type: ICDAR2015Dataset
|
||||
args:
|
||||
data_path:
|
||||
- ''
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
transforms:
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
filter_keys: []
|
||||
ignore_tags: ['*', '###']
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 0
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,82 @@
|
||||
name: DBNet
|
||||
base: ['config/icdar2015.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: deformable_resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 10
|
||||
show_images_iter: 50
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.txt
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.txt
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,82 @@
|
||||
name: DBNet
|
||||
base: ['config/icdar2015.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 10
|
||||
show_images_iter: 50
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.txt
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.txt
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,83 @@
|
||||
name: DBNet
|
||||
base: ['config/icdar2015.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: StepLR
|
||||
args:
|
||||
step_size: 10
|
||||
gama: 0.8
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 500
|
||||
log_iter: 10
|
||||
show_images_iter: 50
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.txt
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.txt
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,79 @@
|
||||
name: DBNet
|
||||
base: ['config/icdar2015.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnet50
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
lr_scheduler:
|
||||
type: Polynomial
|
||||
args:
|
||||
learning_rate: 0.001
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 10
|
||||
show_images_iter: 50
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output/fp16_o2
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.txt
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 16
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.txt
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,73 @@
|
||||
name: DBNet
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
type: DetDataset # 数据集类型
|
||||
args:
|
||||
data_path: # 一个存放 img_path \t gt_path的文件
|
||||
- ''
|
||||
pre_processes: # 数据的预处理过程,包含augment和标签制作
|
||||
- type: IaaAugment # 使用imgaug进行变换
|
||||
args:
|
||||
- {'type':Fliplr, 'args':{'p':0.5}}
|
||||
- {'type': Affine, 'args':{'rotate':[-10,10]}}
|
||||
- {'type':Resize,'args':{'size':[0.5,3]}}
|
||||
- type: EastRandomCropData
|
||||
args:
|
||||
size: [640,640]
|
||||
max_tries: 50
|
||||
keep_ratio: true
|
||||
- type: MakeBorderMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
thresh_min: 0.3
|
||||
thresh_max: 0.7
|
||||
- type: MakeShrinkMap
|
||||
args:
|
||||
shrink_ratio: 0.4
|
||||
min_text_size: 8
|
||||
transforms: # 对图片进行的变换方式
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
filter_keys: [img_path,img_name,text_polys,texts,ignore_tags,shape] # 返回数据之前,从数据字典里删除的key
|
||||
ignore_tags: ['*', '###']
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 0
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
type: DetDataset
|
||||
args:
|
||||
data_path:
|
||||
- ''
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
transforms:
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
load_char_annotation: false # 是否加载字符级标注
|
||||
expand_one_char: false # 是否对只有一个字符的框进行宽度扩充,扩充后w = w+h
|
||||
filter_keys: []
|
||||
ignore_tags: ['*', '###']
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 0
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,86 @@
|
||||
name: DBNet
|
||||
base: ['config/open_dataset.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: deformable_resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 1
|
||||
show_images_iter: 1
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.json
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.json
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,86 @@
|
||||
name: DBNet
|
||||
base: ['config/open_dataset.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnest50
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 1
|
||||
show_images_iter: 1
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.json
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.json
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,93 @@
|
||||
name: DBNet
|
||||
base: ['config/open_dataset.yaml']
|
||||
arch:
|
||||
type: Model
|
||||
backbone:
|
||||
type: resnet18
|
||||
pretrained: true
|
||||
neck:
|
||||
type: FPN
|
||||
inner_channels: 256
|
||||
head:
|
||||
type: DBHead
|
||||
out_channels: 2
|
||||
k: 50
|
||||
post_processing:
|
||||
type: SegDetectorRepresenter
|
||||
args:
|
||||
thresh: 0.3
|
||||
box_thresh: 0.7
|
||||
max_candidates: 1000
|
||||
unclip_ratio: 1.5 # from paper
|
||||
metric:
|
||||
type: QuadMetric
|
||||
args:
|
||||
is_output_polygon: false
|
||||
loss:
|
||||
type: DBLoss
|
||||
alpha: 1
|
||||
beta: 10
|
||||
ohem_ratio: 3
|
||||
optimizer:
|
||||
type: Adam
|
||||
args:
|
||||
lr: 0.001
|
||||
weight_decay: 0
|
||||
amsgrad: true
|
||||
lr_scheduler:
|
||||
type: WarmupPolyLR
|
||||
args:
|
||||
warmup_epoch: 3
|
||||
trainer:
|
||||
seed: 2
|
||||
epochs: 1200
|
||||
log_iter: 1
|
||||
show_images_iter: 1
|
||||
resume_checkpoint: ''
|
||||
finetune_checkpoint: ''
|
||||
output_dir: output
|
||||
visual_dl: false
|
||||
amp:
|
||||
scale_loss: 1024
|
||||
amp_level: O2
|
||||
custom_white_list: []
|
||||
custom_black_list: ['exp', 'sigmoid', 'concat']
|
||||
dataset:
|
||||
train:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/train.json
|
||||
transforms: # 对图片进行的变换方式
|
||||
- type: ToTensor
|
||||
args: {}
|
||||
- type: Normalize
|
||||
args:
|
||||
mean: [0.485, 0.456, 0.406]
|
||||
std: [0.229, 0.224, 0.225]
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ''
|
||||
validate:
|
||||
dataset:
|
||||
args:
|
||||
data_path:
|
||||
- ./datasets/test.json
|
||||
pre_processes:
|
||||
- type: ResizeShortSize
|
||||
args:
|
||||
short_size: 736
|
||||
resize_text_polys: false
|
||||
img_mode: RGB
|
||||
load_char_annotation: false
|
||||
expand_one_char: false
|
||||
loader:
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
num_workers: 6
|
||||
collate_fn: ICDARCollectFN
|
||||
@ -0,0 +1,114 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:52
|
||||
# @Author : zhoujun
|
||||
import copy
|
||||
|
||||
import PIL
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle.io import DataLoader, DistributedBatchSampler, BatchSampler
|
||||
|
||||
from paddle.vision import transforms
|
||||
|
||||
|
||||
def get_dataset(data_path, module_name, transform, dataset_args):
|
||||
"""
|
||||
获取训练dataset
|
||||
:param data_path: dataset文件列表,每个文件内以如下格式存储 ‘path/to/img\tlabel’
|
||||
:param module_name: 所使用的自定义dataset名称,目前只支持data_loaders.ImageDataset
|
||||
:param transform: 该数据集使用的transforms
|
||||
:param dataset_args: module_name的参数
|
||||
:return: 如果data_path列表不为空,返回对于的ConcatDataset对象,否则None
|
||||
"""
|
||||
from . import dataset
|
||||
|
||||
s_dataset = getattr(dataset, module_name)(
|
||||
transform=transform, data_path=data_path, **dataset_args
|
||||
)
|
||||
return s_dataset
|
||||
|
||||
|
||||
def get_transforms(transforms_config):
|
||||
tr_list = []
|
||||
for item in transforms_config:
|
||||
if "args" not in item:
|
||||
args = {}
|
||||
else:
|
||||
args = item["args"]
|
||||
cls = getattr(transforms, item["type"])(**args)
|
||||
tr_list.append(cls)
|
||||
tr_list = transforms.Compose(tr_list)
|
||||
return tr_list
|
||||
|
||||
|
||||
class ICDARCollectFN:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def __call__(self, batch):
|
||||
data_dict = {}
|
||||
to_tensor_keys = []
|
||||
for sample in batch:
|
||||
for k, v in sample.items():
|
||||
if k not in data_dict:
|
||||
data_dict[k] = []
|
||||
if isinstance(v, (np.ndarray, paddle.Tensor, PIL.Image.Image)):
|
||||
if k not in to_tensor_keys:
|
||||
to_tensor_keys.append(k)
|
||||
data_dict[k].append(v)
|
||||
for k in to_tensor_keys:
|
||||
data_dict[k] = paddle.stack(data_dict[k], 0)
|
||||
return data_dict
|
||||
|
||||
|
||||
def get_dataloader(module_config, distributed=False):
    """Build a paddle DataLoader from a dataloader config section.

    :param module_config: dict with 'dataset' ({'type', 'args'}) and 'loader'
        (batch_size, shuffle, num_workers, optional collate_fn class name)
        keys, or None.
    :param distributed: when True, shard batches with DistributedBatchSampler
        instead of a plain BatchSampler.
    :return: a DataLoader, or None when no usable data path is configured.
    """
    if module_config is None:
        return None
    # Work on a copy so the pop() calls below do not mutate the caller's config.
    config = copy.deepcopy(module_config)
    dataset_args = config["dataset"]["args"]
    if "transforms" in dataset_args:
        img_transforms = get_transforms(dataset_args.pop("transforms"))
    else:
        img_transforms = None
    # Build the dataset.
    dataset_name = config["dataset"]["type"]
    data_path = dataset_args.pop("data_path")
    if data_path is None:  # fixed: was `== None`
        return None

    data_path = [x for x in data_path if x is not None]
    if len(data_path) == 0:
        return None
    if (
        "collate_fn" not in config["loader"]
        or config["loader"]["collate_fn"] is None
        or len(config["loader"]["collate_fn"]) == 0
    ):
        config["loader"]["collate_fn"] = None
    else:
        # NOTE(review): eval() on a config-supplied string is safe only for
        # trusted config files; prefer a registry lookup if configs can come
        # from untrusted sources.
        config["loader"]["collate_fn"] = eval(config["loader"]["collate_fn"])()

    _dataset = get_dataset(
        data_path=data_path,
        module_name=dataset_name,
        transform=img_transforms,
        dataset_args=dataset_args,
    )
    if distributed:
        # Shard batches across trainers.
        batch_sampler = DistributedBatchSampler(
            dataset=_dataset,
            batch_size=config["loader"].pop("batch_size"),
            shuffle=config["loader"].pop("shuffle"),
        )
    else:
        batch_sampler = BatchSampler(
            dataset=_dataset,
            batch_size=config["loader"].pop("batch_size"),
            shuffle=config["loader"].pop("shuffle"),
        )
    loader = DataLoader(
        dataset=_dataset, batch_sampler=batch_sampler, **config["loader"]
    )
    return loader
|
||||
190
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/data_loader/dataset.py
Normal file
190
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/data_loader/dataset.py
Normal file
@ -0,0 +1,190 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:54
|
||||
# @Author : zhoujun
|
||||
import pathlib
|
||||
import os
|
||||
import cv2
|
||||
import numpy as np
|
||||
import scipy.io as sio
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
from base import BaseDataSet
|
||||
from utils import order_points_clockwise, get_datalist, load, expand_polygon
|
||||
|
||||
|
||||
class ICDAR2015Dataset(BaseDataSet):
    """Detection dataset for ICDAR2015-style annotation files.

    Each label file holds one box per line: 8 comma-separated coordinates
    followed by the transcript; transcripts listed in ignore_tags are marked
    as ignored.
    """

    def __init__(
        self,
        data_path: str,
        img_mode,
        pre_processes,
        filter_keys,
        ignore_tags,
        transform=None,
        **kwargs,
    ):
        super().__init__(
            data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
        )

    def load_data(self, data_path: str) -> list:
        """Collect samples that contain at least one valid text polygon."""
        data_list = get_datalist(data_path)
        t_data_list = []
        for img_path, label_path in data_list:
            data = self._get_annotation(label_path)
            if len(data["text_polys"]) > 0:
                item = {"img_path": img_path, "img_name": pathlib.Path(img_path).stem}
                item.update(data)
                t_data_list.append(item)
            else:
                print("there is no suit bbox in {}".format(label_path))
        return t_data_list

    def _get_annotation(self, label_path: str) -> dict:
        """Parse one ICDAR label file into polygons, texts and ignore flags."""
        boxes = []
        texts = []
        ignores = []
        with open(label_path, encoding="utf-8", mode="r") as f:
            for line in f.readlines():
                # Strip any UTF-8 BOM remnants before splitting on commas.
                params = line.strip().strip("\ufeff").strip("\xef\xbb\xbf").split(",")
                try:
                    box = order_points_clockwise(
                        np.array(list(map(float, params[:8]))).reshape(-1, 2)
                    )
                    if cv2.contourArea(box) > 0:
                        boxes.append(box)
                        label = params[8]
                        texts.append(label)
                        ignores.append(label in self.ignore_tags)
                # fixed: bare `except:` also swallowed SystemExit and
                # KeyboardInterrupt; keep the best-effort skip-and-report.
                except Exception:
                    print("load label failed on {}".format(label_path))
        data = {
            "text_polys": np.array(boxes),
            "texts": texts,
            "ignore_tags": ignores,
        }
        return data
|
||||
|
||||
|
||||
class DetDataset(BaseDataSet):
    """Detection dataset loaded from JSON annotation files.

    Reads text-line polygons and transcripts (and, optionally, per-character
    annotations) from the project's JSON annotation format.
    """

    def __init__(
        self,
        data_path: str,
        img_mode,
        pre_processes,
        filter_keys,
        ignore_tags,
        transform=None,
        **kwargs,
    ):
        # Both flags are required keyword arguments supplied by the config.
        self.load_char_annotation = kwargs["load_char_annotation"]
        self.expand_one_char = kwargs["expand_one_char"]
        super().__init__(
            data_path, img_mode, pre_processes, filter_keys, ignore_tags, transform
        )

    def load_data(self, data_path: str) -> list:
        """Read line-level (and optional char-level) boxes and labels.

        :param data_path: iterable of JSON annotation file paths
        :return: list of sample dicts with img_path / img_name / text_polys /
            texts / ignore_tags
        """
        samples = []
        for ann_file in data_path:
            content = load(ann_file)
            for gt in tqdm(content["data_list"], desc="read file {}".format(ann_file)):
                img_path = os.path.join(content["data_root"], gt["img_name"])
                polygons, texts = [], []
                illegibility_list, language_list = [], []
                for annotation in gt["annotations"]:
                    if len(annotation["polygon"]) == 0 or len(annotation["text"]) == 0:
                        continue
                    if len(annotation["text"]) > 1 and self.expand_one_char:
                        annotation["polygon"] = expand_polygon(annotation["polygon"])
                    polygons.append(annotation["polygon"])
                    texts.append(annotation["text"])
                    illegibility_list.append(annotation["illegibility"])
                    language_list.append(annotation["language"])
                    if self.load_char_annotation:
                        # Also emit each character box as its own instance.
                        for char_annotation in annotation["chars"]:
                            if (
                                len(char_annotation["polygon"]) == 0
                                or len(char_annotation["char"]) == 0
                            ):
                                continue
                            polygons.append(char_annotation["polygon"])
                            texts.append(char_annotation["char"])
                            illegibility_list.append(char_annotation["illegibility"])
                            language_list.append(char_annotation["language"])
                samples.append(
                    {
                        "img_path": img_path,
                        "img_name": gt["img_name"],
                        "text_polys": np.array(polygons),
                        "texts": texts,
                        "ignore_tags": illegibility_list,
                    }
                )
        return samples
|
||||
|
||||
|
||||
class SynthTextDataset(BaseDataSet):
    """Detection dataset over the SynthText release (gt.mat annotations)."""

    def __init__(
        self,
        data_path: str,
        img_mode,
        pre_processes,
        filter_keys,
        transform=None,
        **kwargs,
    ):
        self.transform = transform
        self.dataRoot = pathlib.Path(data_path)
        if not self.dataRoot.exists():
            raise FileNotFoundError("Dataset folder does not exist.")

        self.targetFilePath = self.dataRoot / "gt.mat"
        if not self.targetFilePath.exists():
            # fixed: previously raised FileExistsError for a *missing* file;
            # FileNotFoundError is the correct exception here.
            raise FileNotFoundError("Target file does not exist.")
        targets = {}
        # Load only the variables we need from the (large) gt.mat file.
        sio.loadmat(
            self.targetFilePath,
            targets,
            squeeze_me=True,
            struct_as_record=False,
            variable_names=["imnames", "wordBB", "txt"],
        )

        self.imageNames = targets["imnames"]
        self.wordBBoxes = targets["wordBB"]
        self.transcripts = targets["txt"]
        super().__init__(data_path, img_mode, pre_processes, filter_keys, transform)

    def load_data(self, data_path: str) -> list:
        """Pair every image with its word boxes and transcripts."""
        t_data_list = []
        for imageName, wordBBoxes, texts in zip(
            self.imageNames, self.wordBBoxes, self.transcripts
        ):
            item = {}
            # A single word yields a 2-D (2, 4) array; normalize to 3-D with a
            # trailing word axis.
            wordBBoxes = (
                np.expand_dims(wordBBoxes, axis=2)
                if (wordBBoxes.ndim == 2)
                else wordBBoxes
            )
            _, _, numOfWords = wordBBoxes.shape
            text_polys = wordBBoxes.reshape(
                [8, numOfWords], order="F"
            ).T  # num_words * 8
            text_polys = text_polys.reshape(numOfWords, 4, 2)  # num_of_words * 4 * 2
            transcripts = [word for line in texts for word in line.split()]
            if numOfWords != len(transcripts):
                # Skip images whose word count and transcript count disagree.
                continue
            item["img_path"] = str(self.dataRoot / imageName)
            item["img_name"] = (self.dataRoot / imageName).stem
            item["text_polys"] = text_polys
            item["texts"] = transcripts
            item["ignore_tags"] = [x in self.ignore_tags for x in transcripts]
            t_data_list.append(item)
        return t_data_list
|
||||
@ -0,0 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/4 10:53
|
||||
# @Author : zhoujun
|
||||
from .iaa_augment import IaaAugment
|
||||
from .augment import *
|
||||
from .random_crop_data import EastRandomCropData, PSERandomCrop
|
||||
from .make_border_map import MakeBorderMap
|
||||
from .make_shrink_map import MakeShrinkMap
|
||||
@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:52
|
||||
# @Author : zhoujun
|
||||
|
||||
import math
|
||||
import numbers
|
||||
import random
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from skimage.util import random_noise
|
||||
|
||||
|
||||
class RandomNoise:
    """Randomly add gaussian noise to the image."""

    def __init__(self, random_rate):
        # Probability threshold: noise is applied when random() <= random_rate.
        self.random_rate = random_rate

    def __call__(self, data: dict):
        """Apply gaussian noise to data['img'] with probability random_rate.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        :return: data, with 'img' replaced when the noise branch was taken
        """
        if random.random() > self.random_rate:
            return data
        noisy = random_noise(data["img"], mode="gaussian", clip=True) * 255
        data["img"] = noisy.astype(data["img"].dtype)
        return data
|
||||
|
||||
|
||||
class RandomScale:
    """Randomly rescale the image and its text polygons."""

    def __init__(self, scales, random_rate):
        """
        :param scales: candidate scale factors to choose from
        :param random_rate: probability threshold for applying the transform
        """
        self.random_rate = random_rate
        self.scales = scales

    def __call__(self, data: dict) -> dict:
        """Pick one scale at random and resize the image and polygons.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        """
        if random.random() > self.random_rate:
            return data
        image = data["img"]
        # Copy so the caller's polygon array is not scaled in place.
        scaled_polys = data["text_polys"].copy()

        factor = float(np.random.choice(self.scales))
        image = cv2.resize(image, dsize=None, fx=factor, fy=factor)
        scaled_polys *= factor

        data["img"] = image
        data["text_polys"] = scaled_polys
        return data
|
||||
|
||||
|
||||
class RandomRotateImgBox:
    # Randomly rotates the image and maps its text boxes into the rotated frame.
    def __init__(self, degrees, random_rate, same_size=False):
        """
        :param degrees: rotation range; a single non-negative number d means
            (-d, d), otherwise a 2-element sequence (min, max)
        :param random_rate: probability threshold for applying the transform
        :param same_size: keep the rotated image the same size as the input
            (corners may be clipped) instead of enlarging the canvas
        :return:
        """
        if isinstance(degrees, numbers.Number):
            if degrees < 0:
                raise ValueError("If degrees is a single number, it must be positive.")
            # A scalar d denotes the symmetric range (-d, d).
            degrees = (-degrees, degrees)
        elif (
            isinstance(degrees, list)
            or isinstance(degrees, tuple)
            or isinstance(degrees, np.ndarray)
        ):
            if len(degrees) != 2:
                raise ValueError("If degrees is a sequence, it must be of len 2.")
            degrees = degrees
        else:
            raise Exception("degrees must in Number or list or tuple or np.ndarray")
        self.degrees = degrees
        self.same_size = same_size
        self.random_rate = random_rate

    def __call__(self, data: dict) -> dict:
        """
        Rotate the image and its text boxes by a random angle from self.degrees.
        :param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
        :return:
        """
        if random.random() > self.random_rate:
            return data
        im = data["img"]
        text_polys = data["text_polys"]

        # ---------------------- rotate the image ----------------------
        w = im.shape[1]
        h = im.shape[0]
        angle = np.random.uniform(self.degrees[0], self.degrees[1])

        if self.same_size:
            nw = w
            nh = h
        else:
            # degrees -> radians
            rangle = np.deg2rad(angle)
            # size of the enlarged canvas that fully contains the rotated image
            nw = abs(np.sin(rangle) * h) + abs(np.cos(rangle) * w)
            nh = abs(np.cos(rangle) * h) + abs(np.sin(rangle) * w)
        # build the affine matrix around the new canvas center
        rot_mat = cv2.getRotationMatrix2D((nw * 0.5, nh * 0.5), angle, 1)
        # translation from the old image center to the new canvas center
        rot_move = np.dot(rot_mat, np.array([(nw - w) * 0.5, (nh - h) * 0.5, 0]))
        # fold the translation into the affine matrix
        rot_mat[0, 2] += rot_move[0]
        rot_mat[1, 2] += rot_move[1]
        # apply the affine warp
        rot_img = cv2.warpAffine(
            im,
            rot_mat,
            (int(math.ceil(nw)), int(math.ceil(nh))),
            flags=cv2.INTER_LANCZOS4,
        )

        # ---------------------- correct the bbox coordinates ----------------------
        # rot_mat is the final affine matrix; map each of the four corners of
        # every box into the rotated coordinate system
        rot_text_polys = list()
        for bbox in text_polys:
            point1 = np.dot(rot_mat, np.array([bbox[0, 0], bbox[0, 1], 1]))
            point2 = np.dot(rot_mat, np.array([bbox[1, 0], bbox[1, 1], 1]))
            point3 = np.dot(rot_mat, np.array([bbox[2, 0], bbox[2, 1], 1]))
            point4 = np.dot(rot_mat, np.array([bbox[3, 0], bbox[3, 1], 1]))
            rot_text_polys.append([point1, point2, point3, point4])
        data["img"] = rot_img
        data["text_polys"] = np.array(rot_text_polys)
        return data
|
||||
|
||||
|
||||
class RandomResize:
    """Randomly resize the image (optionally ratio-preserving via padding)."""

    def __init__(self, size, random_rate, keep_ratio=False):
        """
        :param size: target size; a number (square) or a 2-element sequence [w, h]
        :param random_rate: probability threshold for applying the transform
        :param keep_ratio: zero-pad the short side first so the aspect ratio
            survives the final resize
        """
        if isinstance(size, numbers.Number):
            if size < 0:
                raise ValueError(
                    "If input_size is a single number, it must be positive."
                )
            size = (size, size)
        elif isinstance(size, (list, tuple, np.ndarray)):
            if len(size) != 2:
                raise ValueError("If input_size is a sequence, it must be of len 2.")
            size = (size[0], size[1])
        else:
            raise Exception("input_size must in Number or list or tuple or np.ndarray")
        self.size = size
        self.keep_ratio = keep_ratio
        self.random_rate = random_rate

    def __call__(self, data: dict) -> dict:
        """Resize the image and polygons with probability random_rate.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        """
        if random.random() > self.random_rate:
            return data
        image = data["img"]
        polys = data["text_polys"]

        if self.keep_ratio:
            # Pad the short side with zeros so resizing keeps the aspect ratio.
            h, w, c = image.shape
            canvas = np.zeros(
                (max(h, self.size[0]), max(w, self.size[1]), c), dtype=np.uint8
            )
            canvas[:h, :w] = image.copy()
            image = canvas
        polys = polys.astype(np.float32)
        h, w, _ = image.shape
        image = cv2.resize(image, self.size)
        polys[:, :, 0] *= self.size[0] / float(w)
        polys[:, :, 1] *= self.size[1] / float(h)

        data["img"] = image
        data["text_polys"] = polys
        return data
|
||||
|
||||
|
||||
def resize_image(img, short_size):
    """Resize so the short side becomes short_size, snapping both sides to
    multiples of 32 (the network stride).

    :param img: HxWxC image array
    :param short_size: target length for the shorter side before rounding
    :return: (resized_img, (w_ratio, h_ratio))
    """
    height, width, _ = img.shape
    # Scale the short side to short_size while keeping the aspect ratio.
    if height < width:
        new_height, new_width = short_size, short_size / height * width
    else:
        new_width, new_height = short_size, short_size / width * height
    # Round both sides to the nearest multiple of 32.
    new_height = int(round(new_height / 32) * 32)
    new_width = int(round(new_width / 32) * 32)
    resized_img = cv2.resize(img, (new_width, new_height))
    return resized_img, (new_width / width, new_height / height)
|
||||
|
||||
|
||||
class ResizeShortSize:
    """Scale the image up (never down) so its short side is at least short_size."""

    def __init__(self, short_size, resize_text_polys=True):
        """
        :param short_size: minimum length of the short side after the transform
        :param resize_text_polys: also scale the text polygons with the image
        """
        self.short_size = short_size
        self.resize_text_polys = resize_text_polys

    def __call__(self, data: dict) -> dict:
        """Resize the image (and optionally its polygons) in place in data.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        """
        im = data["img"]
        text_polys = data["text_polys"]

        h, w, _ = im.shape
        short_edge = min(h, w)
        if short_edge < self.short_size:
            # Scale up so the short edge reaches short_size.
            scale = self.short_size / short_edge
            im = cv2.resize(im, dsize=None, fx=scale, fy=scale)
            scale = (scale, scale)
            if self.resize_text_polys:
                # fixed: `text_polys[:, 0]` / `[:, 1]` scaled only the first two
                # *vertices* of each (N, 4, 2) polygon; x/y live on the last
                # axis (cf. HorizontalFlip's `[:, :, 0]` indexing).
                text_polys[:, :, 0] *= scale[0]
                text_polys[:, :, 1] *= scale[1]

        data["img"] = im
        data["text_polys"] = text_polys
        return data
|
||||
|
||||
|
||||
class HorizontalFlip:
    """Randomly mirror the image and text polygons left-right."""

    def __init__(self, random_rate):
        """
        :param random_rate: probability threshold for applying the flip
        """
        self.random_rate = random_rate

    def __call__(self, data: dict) -> dict:
        """Flip horizontally with probability random_rate.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        """
        if random.random() > self.random_rate:
            return data
        image = data["img"]
        mirrored_polys = data["text_polys"].copy()

        mirrored = cv2.flip(image, 1)
        h, w, _ = mirrored.shape
        # Mirror the x coordinate of every vertex.
        mirrored_polys[:, :, 0] = w - mirrored_polys[:, :, 0]

        data["img"] = mirrored
        data["text_polys"] = mirrored_polys
        return data
|
||||
|
||||
|
||||
class VerticalFlip:
    """Randomly mirror the image and text polygons top-bottom."""

    def __init__(self, random_rate):
        """
        :param random_rate: probability threshold for applying the flip
        """
        self.random_rate = random_rate

    def __call__(self, data: dict) -> dict:
        """Flip vertically with probability random_rate.

        :param data: {'img':, 'text_polys':, 'texts':, 'ignore_tags':}
        """
        if random.random() > self.random_rate:
            return data
        image = data["img"]
        mirrored_polys = data["text_polys"].copy()

        mirrored = cv2.flip(image, 0)
        h, w, _ = mirrored.shape
        # Mirror the y coordinate of every vertex.
        mirrored_polys[:, :, 1] = h - mirrored_polys[:, :, 1]
        data["img"] = mirrored
        data["text_polys"] = mirrored_polys
        return data
|
||||
@ -0,0 +1,68 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/4 18:06
|
||||
# @Author : zhoujun
|
||||
import numpy as np
|
||||
import imgaug
|
||||
import imgaug.augmenters as iaa
|
||||
|
||||
|
||||
class AugmenterBuilder(object):
    """Build imgaug augmenters from nested list/dict config structures."""

    def __init__(self):
        pass

    def build(self, args, root=True):
        """Recursively construct an augmenter.

        :param args: None/empty -> None; at the root, a list of child specs
            becomes an iaa.Sequential; a non-root list is [name, *ctor_args];
            a dict is {'type': name, 'args': {kwargs}}
        :param root: whether this call is at the top of the config tree
        :raises RuntimeError: for any other argument shape
        """
        if args is None or len(args) == 0:
            return None
        if isinstance(args, list):
            if root:
                children = [self.build(child, root=False) for child in args]
                return iaa.Sequential(children)
            ctor_args = [self.to_tuple_if_list(a) for a in args[1:]]
            return getattr(iaa, args[0])(*ctor_args)
        if isinstance(args, dict):
            kwargs = {k: self.to_tuple_if_list(v) for k, v in args["args"].items()}
            return getattr(iaa, args["type"])(**kwargs)
        raise RuntimeError("unknown augmenter arg: " + str(args))

    def to_tuple_if_list(self, obj):
        # imgaug expects tuples for value ranges; configs deliver lists.
        return tuple(obj) if isinstance(obj, list) else obj
|
||||
|
||||
|
||||
class IaaAugment:
    """Apply an imgaug pipeline to the image and its polygon keypoints."""

    def __init__(self, augmenter_args):
        self.augmenter_args = augmenter_args
        self.augmenter = AugmenterBuilder().build(self.augmenter_args)

    def __call__(self, data):
        image = data["img"]
        shape = image.shape

        if self.augmenter:
            # Freeze the random state so the image and its keypoints receive
            # exactly the same transform.
            aug = self.augmenter.to_deterministic()
            data["img"] = aug.augment_image(image)
            data = self.may_augment_annotation(aug, data, shape)
        return data

    def may_augment_annotation(self, aug, data, shape):
        if aug is None:
            return data

        transformed = []
        for poly in data["text_polys"]:
            transformed.append(self.may_augment_poly(aug, shape, poly))
        data["text_polys"] = np.array(transformed)
        return data

    def may_augment_poly(self, aug, img_shape, poly):
        keypoints = [imgaug.Keypoint(p[0], p[1]) for p in poly]
        keypoints = aug.augment_keypoints(
            [imgaug.KeypointsOnImage(keypoints, shape=img_shape)]
        )[0].keypoints
        return [(p.x, p.y) for p in keypoints]
|
||||
@ -0,0 +1,159 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
np.seterr(divide="ignore", invalid="ignore")
|
||||
import pyclipper
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class MakeBorderMap:
    """Build the DB 'threshold map' supervision: for each text polygon, a
    border band whose value decays with distance from the polygon edge,
    rescaled into [thresh_min, thresh_max], plus a mask of the band region."""

    def __init__(self, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7):
        # shrink_ratio controls the dilation distance of the border band;
        # thresh_min/thresh_max bound the values written into the map.
        self.shrink_ratio = shrink_ratio
        self.thresh_min = thresh_min
        self.thresh_max = thresh_max

    def __call__(self, data: dict) -> dict:
        """
        Render per-polygon border maps into a full-image canvas and mask.
        :param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
        :return: data with 'threshold_map' and 'threshold_mask' added
        """
        im = data["img"]
        text_polys = data["text_polys"]
        ignore_tags = data["ignore_tags"]

        canvas = np.zeros(im.shape[:2], dtype=np.float32)
        mask = np.zeros(im.shape[:2], dtype=np.float32)

        for i in range(len(text_polys)):
            if ignore_tags[i]:
                continue
            self.draw_border_map(text_polys[i], canvas, mask=mask)
        # Rescale the [0, 1] distance response into [thresh_min, thresh_max].
        canvas = canvas * (self.thresh_max - self.thresh_min) + self.thresh_min

        data["threshold_map"] = canvas
        data["threshold_mask"] = mask
        return data

    def draw_border_map(self, polygon, canvas, mask):
        # Draw the border band of one polygon into canvas and mark it in mask.
        polygon = np.array(polygon)
        assert polygon.ndim == 2
        assert polygon.shape[1] == 2

        polygon_shape = Polygon(polygon)
        if polygon_shape.area <= 0:
            return
        # Same area/perimeter-based offset distance formula as DB's shrink step.
        distance = (
            polygon_shape.area
            * (1 - np.power(self.shrink_ratio, 2))
            / polygon_shape.length
        )
        subject = [tuple(l) for l in polygon]
        padding = pyclipper.PyclipperOffset()
        padding.AddPath(subject, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)

        # Dilate the polygon outward by `distance` to get the band extent.
        padded_polygon = np.array(padding.Execute(distance)[0])
        cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)

        xmin = padded_polygon[:, 0].min()
        xmax = padded_polygon[:, 0].max()
        ymin = padded_polygon[:, 1].min()
        ymax = padded_polygon[:, 1].max()
        width = xmax - xmin + 1
        height = ymax - ymin + 1

        # Shift polygon coordinates into the local bounding-box frame.
        polygon[:, 0] = polygon[:, 0] - xmin
        polygon[:, 1] = polygon[:, 1] - ymin

        # Pixel-coordinate grids over the local bounding box.
        xs = np.broadcast_to(
            np.linspace(0, width - 1, num=width).reshape(1, width), (height, width)
        )
        ys = np.broadcast_to(
            np.linspace(0, height - 1, num=height).reshape(height, 1), (height, width)
        )

        # Per-edge normalized distance field, clipped to [0, 1]; keep the
        # minimum over all edges.
        distance_map = np.zeros((polygon.shape[0], height, width), dtype=np.float32)
        for i in range(polygon.shape[0]):
            j = (i + 1) % polygon.shape[0]
            absolute_distance = self.distance(xs, ys, polygon[i], polygon[j])
            distance_map[i] = np.clip(absolute_distance / distance, 0, 1)
        distance_map = distance_map.min(axis=0)

        # Clamp the box to the canvas and blend with fmax so overlapping
        # polygons keep the stronger response.
        xmin_valid = min(max(0, xmin), canvas.shape[1] - 1)
        xmax_valid = min(max(0, xmax), canvas.shape[1] - 1)
        ymin_valid = min(max(0, ymin), canvas.shape[0] - 1)
        ymax_valid = min(max(0, ymax), canvas.shape[0] - 1)
        canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1] = np.fmax(
            1
            - distance_map[
                ymin_valid - ymin : ymax_valid - ymax + height,
                xmin_valid - xmin : xmax_valid - xmax + width,
            ],
            canvas[ymin_valid : ymax_valid + 1, xmin_valid : xmax_valid + 1],
        )

    def distance(self, xs, ys, point_1, point_2):
        """
        compute the distance from point to a line
        ys: coordinates in the first axis
        xs: coordinates in the second axis
        point_1, point_2: (x, y), the end of the line
        """
        height, width = xs.shape[:2]
        square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
        square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
        square_distance = np.square(point_1[0] - point_2[0]) + np.square(
            point_1[1] - point_2[1]
        )

        # Law of cosines: cosine of the angle at the query point subtended by
        # the two segment endpoints.
        cosin = (square_distance - square_distance_1 - square_distance_2) / (
            2 * np.sqrt(square_distance_1 * square_distance_2)
        )
        square_sin = 1 - np.square(cosin)
        square_sin = np.nan_to_num(square_sin)

        # Perpendicular distance to the segment's supporting line.
        result = np.sqrt(
            square_distance_1 * square_distance_2 * square_sin / square_distance
        )
        # Outside the segment's span, fall back to the nearer endpoint distance.
        result[cosin < 0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[
            cosin < 0
        ]
        # self.extend_line(point_1, point_2, result)
        return result

    def extend_line(self, point_1, point_2, result):
        # Draw sentinel lines extending the segment beyond both endpoints
        # (currently unused; kept from the reference implementation).
        ex_point_1 = (
            int(
                round(point_1[0] + (point_1[0] - point_2[0]) * (1 + self.shrink_ratio))
            ),
            int(
                round(point_1[1] + (point_1[1] - point_2[1]) * (1 + self.shrink_ratio))
            ),
        )
        cv2.line(
            result,
            tuple(ex_point_1),
            tuple(point_1),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0,
        )
        ex_point_2 = (
            int(
                round(point_2[0] + (point_2[0] - point_1[0]) * (1 + self.shrink_ratio))
            ),
            int(
                round(point_2[1] + (point_2[1] - point_1[1]) * (1 + self.shrink_ratio))
            ),
        )
        cv2.line(
            result,
            tuple(ex_point_2),
            tuple(point_2),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0,
        )
        return ex_point_1, ex_point_2
|
||||
@ -0,0 +1,129 @@
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
|
||||
def shrink_polygon_py(polygon, shrink_ratio):
    """Shrink a polygon toward its centroid by shrink_ratio, in place.

    Calling again with 1/shrink_ratio restores the original polygon.

    :param polygon: (N, 2) array of vertices (modified in place)
    :param shrink_ratio: multiplicative factor applied to each vertex offset
    :return: the same (mutated) polygon array
    """
    center_x = polygon[:, 0].mean()
    center_y = polygon[:, 1].mean()
    polygon[:, 0] = center_x + (polygon[:, 0] - center_x) * shrink_ratio
    polygon[:, 1] = center_y + (polygon[:, 1] - center_y) * shrink_ratio
    return polygon
|
||||
|
||||
|
||||
def shrink_polygon_pyclipper(polygon, shrink_ratio):
    """Shrink a polygon inward with pyclipper using the DB offset formula.

    :param polygon: (N, 2) array of vertices
    :param shrink_ratio: DB shrink ratio; the inward offset distance is
        area * (1 - shrink_ratio^2) / perimeter
    :return: (M, 2) array of the shrunk polygon, or an empty array when the
        polygon collapses entirely
    """
    from shapely.geometry import Polygon
    import pyclipper

    shape = Polygon(polygon)
    distance = shape.area * (1 - np.power(shrink_ratio, 2)) / shape.length
    offsetter = pyclipper.PyclipperOffset()
    offsetter.AddPath(
        [tuple(pt) for pt in polygon], pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON
    )
    shrunk = offsetter.Execute(-distance)
    if shrunk == []:
        # The polygon collapsed: return an empty array.
        return np.array(shrunk)
    return np.array(shrunk[0]).reshape(-1, 2)
|
||||
|
||||
|
||||
class MakeShrinkMap:
    r"""
    Making binary mask from detection data with ICDAR format.
    Typically following the process of class `MakeICDARData`.
    """

    def __init__(self, min_text_size=8, shrink_ratio=0.4, shrink_type="pyclipper"):
        # Map the configured shrink backend name to its implementation.
        shrink_func_dict = {
            "py": shrink_polygon_py,
            "pyclipper": shrink_polygon_pyclipper,
        }
        self.shrink_func = shrink_func_dict[shrink_type]
        self.min_text_size = min_text_size
        self.shrink_ratio = shrink_ratio

    def __call__(self, data: dict) -> dict:
        """
        Build the shrunk text-region ground truth ('shrink_map') and the
        valid-supervision mask ('shrink_mask') from the text polygons.
        :param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
        :return:
        """
        image = data["img"]
        text_polys = data["text_polys"]
        ignore_tags = data["ignore_tags"]

        h, w = image.shape[:2]
        text_polys, ignore_tags = self.validate_polygons(text_polys, ignore_tags, h, w)
        gt = np.zeros((h, w), dtype=np.float32)
        mask = np.ones((h, w), dtype=np.float32)
        for i in range(len(text_polys)):
            polygon = text_polys[i]
            height = max(polygon[:, 1]) - min(polygon[:, 1])
            width = max(polygon[:, 0]) - min(polygon[:, 0])
            # Too-small or explicitly ignored text: exclude the region from loss.
            if ignore_tags[i] or min(height, width) < self.min_text_size:
                cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
                ignore_tags[i] = True
            else:
                shrunk = self.shrink_func(polygon, self.shrink_ratio)
                if shrunk.size == 0:
                    # Shrinking collapsed the polygon: treat it as ignored.
                    cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
                    ignore_tags[i] = True
                    continue
                cv2.fillPoly(gt, [shrunk.astype(np.int32)], 1)

        data["shrink_map"] = gt
        data["shrink_mask"] = mask
        return data

    def validate_polygons(self, polygons, ignore_tags, h, w):
        """
        polygons (numpy.array, required): of shape (num_instances, num_points, 2)
        """
        if len(polygons) == 0:
            return polygons, ignore_tags
        assert len(polygons) == len(ignore_tags)
        # Clamp every vertex into the image bounds.
        for polygon in polygons:
            polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
            polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)

        for i in range(len(polygons)):
            area = self.polygon_area(polygons[i])
            # Degenerate polygons are ignored; positive-area ones are reversed
            # (intended to unify winding order — see note in polygon_area).
            if abs(area) < 1:
                ignore_tags[i] = True
            if area > 0:
                polygons[i] = polygons[i][::-1, :]
        return polygons, ignore_tags

    def polygon_area(self, polygon):
        # NOTE(review): cv2.contourArea without oriented=True returns an
        # unsigned area, so `area > 0` above reverses every non-degenerate
        # polygon; the commented shoelace version below was signed — confirm
        # which behavior is intended.
        return cv2.contourArea(polygon)
        # edge = 0
        # for i in range(polygon.shape[0]):
        #     next_index = (i + 1) % polygon.shape[0]
        #     edge += (polygon[next_index, 0] - polygon[i, 0]) * (polygon[next_index, 1] - polygon[i, 1])
        #
        # return edge / 2.
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick manual check: shrink a quadrilateral with both backends, then
    # re-expand the pyclipper result and print its bounding-box corners.
    from shapely.geometry import Polygon
    import pyclipper

    polygon = np.array([[0, 0], [100, 10], [100, 100], [10, 90]])
    a = shrink_polygon_py(polygon, 0.4)
    print(a)
    # Shrinking by 1/0.4 should restore the original vertices.
    print(shrink_polygon_py(a, 1 / 0.4))
    b = shrink_polygon_pyclipper(polygon, 0.4)
    print(b)
    poly = Polygon(b)
    distance = poly.area * 1.5 / poly.length
    offset = pyclipper.PyclipperOffset()
    offset.AddPath(b, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
    expanded = np.array(offset.Execute(distance))
    bounding_box = cv2.minAreaRect(expanded)
    points = cv2.boxPoints(bounding_box)
    print(points)
|
||||
@ -0,0 +1,211 @@
|
||||
import random
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
# random crop algorithm similar to https://github.com/argman/EAST
class EastRandomCropData:
    def __init__(
        self,
        size=(640, 640),
        max_tries=50,
        min_crop_side_ratio=0.1,
        require_original_image=False,
        keep_ratio=True,
    ):
        # Target output size (w, h) of the crop.
        self.size = size
        # Max attempts when sampling a crop region (used by crop_area).
        self.max_tries = max_tries
        # Lower bound on crop side length relative to the image side.
        self.min_crop_side_ratio = min_crop_side_ratio
        self.require_original_image = require_original_image
        # Pad instead of stretching when resizing the crop to `size`.
        self.keep_ratio = keep_ratio
|
||||
|
||||
def __call__(self, data: dict) -> dict:
    """
    Crop a text-containing region and resize it to self.size, keeping only
    the polygons/texts/ignore flags that survive the crop.
    :param data: {'img':,'text_polys':,'texts':,'ignore_tags':}
    :return:
    """
    im = data["img"]
    text_polys = data["text_polys"]
    ignore_tags = data["ignore_tags"]
    texts = data["texts"]
    all_care_polys = [text_polys[i] for i, tag in enumerate(ignore_tags) if not tag]
    # Choose the crop window.
    crop_x, crop_y, crop_w, crop_h = self.crop_area(im, all_care_polys)
    # Resize the crop to the target size; with keep_ratio, pad with zeros.
    scale_w = self.size[0] / crop_w
    scale_h = self.size[1] / crop_h
    scale = min(scale_w, scale_h)
    h = int(crop_h * scale)
    w = int(crop_w * scale)
    if self.keep_ratio:
        if len(im.shape) == 3:
            padimg = np.zeros((self.size[1], self.size[0], im.shape[2]), im.dtype)
        else:
            padimg = np.zeros((self.size[1], self.size[0]), im.dtype)
        padimg[:h, :w] = cv2.resize(
            im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], (w, h)
        )
        img = padimg
    else:
        img = cv2.resize(
            im[crop_y : crop_y + crop_h, crop_x : crop_x + crop_w], tuple(self.size)
        )
    # Keep only the boxes that intersect the crop, shifted and scaled.
    text_polys_crop = []
    ignore_tags_crop = []
    texts_crop = []
    for poly, text, tag in zip(text_polys, texts, ignore_tags):
        poly = ((poly - (crop_x, crop_y)) * scale).tolist()
        if not self.is_poly_outside_rect(poly, 0, 0, w, h):
            text_polys_crop.append(poly)
            ignore_tags_crop.append(tag)
            texts_crop.append(text)
    data["img"] = img
    data["text_polys"] = np.float32(text_polys_crop)
    data["ignore_tags"] = ignore_tags_crop
    data["texts"] = texts_crop
    return data
|
||||
|
||||
def is_poly_in_rect(self, poly, x, y, w, h):
|
||||
poly = np.array(poly)
|
||||
if poly[:, 0].min() < x or poly[:, 0].max() > x + w:
|
||||
return False
|
||||
if poly[:, 1].min() < y or poly[:, 1].max() > y + h:
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_poly_outside_rect(self, poly, x, y, w, h):
|
||||
poly = np.array(poly)
|
||||
if poly[:, 0].max() < x or poly[:, 0].min() > x + w:
|
||||
return True
|
||||
if poly[:, 1].max() < y or poly[:, 1].min() > y + h:
|
||||
return True
|
||||
return False
|
||||
|
||||
def split_regions(self, axis):
|
||||
regions = []
|
||||
min_axis = 0
|
||||
for i in range(1, axis.shape[0]):
|
||||
if axis[i] != axis[i - 1] + 1:
|
||||
region = axis[min_axis:i]
|
||||
min_axis = i
|
||||
regions.append(region)
|
||||
return regions
|
||||
|
||||
def random_select(self, axis, max_size):
|
||||
xx = np.random.choice(axis, size=2)
|
||||
xmin = np.min(xx)
|
||||
xmax = np.max(xx)
|
||||
xmin = np.clip(xmin, 0, max_size - 1)
|
||||
xmax = np.clip(xmax, 0, max_size - 1)
|
||||
return xmin, xmax
|
||||
|
||||
def region_wise_random_select(self, regions, max_size):
|
||||
selected_index = list(np.random.choice(len(regions), 2))
|
||||
selected_values = []
|
||||
for index in selected_index:
|
||||
axis = regions[index]
|
||||
xx = int(np.random.choice(axis, size=1))
|
||||
selected_values.append(xx)
|
||||
xmin = min(selected_values)
|
||||
xmax = max(selected_values)
|
||||
return xmin, xmax
|
||||
|
||||
def crop_area(self, im, text_polys):
|
||||
h, w = im.shape[:2]
|
||||
h_array = np.zeros(h, dtype=np.int32)
|
||||
w_array = np.zeros(w, dtype=np.int32)
|
||||
for points in text_polys:
|
||||
points = np.round(points, decimals=0).astype(np.int32)
|
||||
minx = np.min(points[:, 0])
|
||||
maxx = np.max(points[:, 0])
|
||||
w_array[minx:maxx] = 1
|
||||
miny = np.min(points[:, 1])
|
||||
maxy = np.max(points[:, 1])
|
||||
h_array[miny:maxy] = 1
|
||||
# ensure the cropped area not across a text
|
||||
h_axis = np.where(h_array == 0)[0]
|
||||
w_axis = np.where(w_array == 0)[0]
|
||||
|
||||
if len(h_axis) == 0 or len(w_axis) == 0:
|
||||
return 0, 0, w, h
|
||||
|
||||
h_regions = self.split_regions(h_axis)
|
||||
w_regions = self.split_regions(w_axis)
|
||||
|
||||
for i in range(self.max_tries):
|
||||
if len(w_regions) > 1:
|
||||
xmin, xmax = self.region_wise_random_select(w_regions, w)
|
||||
else:
|
||||
xmin, xmax = self.random_select(w_axis, w)
|
||||
if len(h_regions) > 1:
|
||||
ymin, ymax = self.region_wise_random_select(h_regions, h)
|
||||
else:
|
||||
ymin, ymax = self.random_select(h_axis, h)
|
||||
|
||||
if (
|
||||
xmax - xmin < self.min_crop_side_ratio * w
|
||||
or ymax - ymin < self.min_crop_side_ratio * h
|
||||
):
|
||||
# area too small
|
||||
continue
|
||||
num_poly_in_rect = 0
|
||||
for poly in text_polys:
|
||||
if not self.is_poly_outside_rect(
|
||||
poly, xmin, ymin, xmax - xmin, ymax - ymin
|
||||
):
|
||||
num_poly_in_rect += 1
|
||||
break
|
||||
|
||||
if num_poly_in_rect > 0:
|
||||
return xmin, ymin, xmax - xmin, ymax - ymin
|
||||
|
||||
return 0, 0, w, h
|
||||
|
||||
|
||||
class PSERandomCrop:
|
||||
def __init__(self, size):
|
||||
self.size = size
|
||||
|
||||
def __call__(self, data):
|
||||
imgs = data["imgs"]
|
||||
|
||||
h, w = imgs[0].shape[0:2]
|
||||
th, tw = self.size
|
||||
if w == tw and h == th:
|
||||
return imgs
|
||||
|
||||
# label中存在文本实例,并且按照概率进行裁剪,使用threshold_label_map控制
|
||||
if np.max(imgs[2]) > 0 and random.random() > 3 / 8:
|
||||
# 文本实例的左上角点
|
||||
tl = np.min(np.where(imgs[2] > 0), axis=1) - self.size
|
||||
tl[tl < 0] = 0
|
||||
# 文本实例的右下角点
|
||||
br = np.max(np.where(imgs[2] > 0), axis=1) - self.size
|
||||
br[br < 0] = 0
|
||||
# 保证选到右下角点时,有足够的距离进行crop
|
||||
br[0] = min(br[0], h - th)
|
||||
br[1] = min(br[1], w - tw)
|
||||
|
||||
for _ in range(50000):
|
||||
i = random.randint(tl[0], br[0])
|
||||
j = random.randint(tl[1], br[1])
|
||||
# 保证shrink_label_map有文本
|
||||
if imgs[1][i : i + th, j : j + tw].sum() <= 0:
|
||||
continue
|
||||
else:
|
||||
break
|
||||
else:
|
||||
i = random.randint(0, h - th)
|
||||
j = random.randint(0, w - tw)
|
||||
|
||||
# return i, j, th, tw
|
||||
for idx in range(len(imgs)):
|
||||
if len(imgs[idx].shape) == 3:
|
||||
imgs[idx] = imgs[idx][i : i + th, j : j + tw, :]
|
||||
else:
|
||||
imgs[idx] = imgs[idx][i : i + th, j : j + tw]
|
||||
data["imgs"] = imgs
|
||||
return data
|
||||
21
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/environment.yml
Normal file
21
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/environment.yml
Normal file
@ -0,0 +1,21 @@
|
||||
name: dbnet
|
||||
channels:
|
||||
- conda-forge
|
||||
- defaults
|
||||
dependencies:
|
||||
- anyconfig==0.9.10
|
||||
- future==0.18.2
|
||||
- imgaug==0.4.0
|
||||
- matplotlib==3.1.2
|
||||
- numpy==1.17.4
|
||||
- opencv
|
||||
- pyclipper
|
||||
- PyYAML==5.2
|
||||
- scikit-image==0.16.2
|
||||
- Shapely==1.6.4
|
||||
- tensorboard=2
|
||||
- tqdm==4.40.1
|
||||
- ipython
|
||||
- pip
|
||||
- pip:
|
||||
- polygon3
|
||||
1
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/eval.sh
Normal file
1
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/eval.sh
Normal file
@ -0,0 +1 @@
|
||||
CUDA_VISIBLE_DEVICES=0 python3 tools/eval.py --model_path ''
|
||||
17
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/generate_lists.sh
Normal file
17
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/generate_lists.sh
Normal file
@ -0,0 +1,17 @@
|
||||
#Only use if your file names of the images and txts are identical
|
||||
rm ./datasets/train_img.txt
|
||||
rm ./datasets/train_gt.txt
|
||||
rm ./datasets/test_img.txt
|
||||
rm ./datasets/test_gt.txt
|
||||
rm ./datasets/train.txt
|
||||
rm ./datasets/test.txt
|
||||
ls ./datasets/train/img/*.jpg > ./datasets/train_img.txt
|
||||
ls ./datasets/train/gt/*.txt > ./datasets/train_gt.txt
|
||||
ls ./datasets/test/img/*.jpg > ./datasets/test_img.txt
|
||||
ls ./datasets/test/gt/*.txt > ./datasets/test_gt.txt
|
||||
paste ./datasets/train_img.txt ./datasets/train_gt.txt > ./datasets/train.txt
|
||||
paste ./datasets/test_img.txt ./datasets/test_gt.txt > ./datasets/test.txt
|
||||
rm ./datasets/train_img.txt
|
||||
rm ./datasets/train_gt.txt
|
||||
rm ./datasets/test_img.txt
|
||||
rm ./datasets/test_gt.txt
|
||||
22
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/__init__.py
Normal file
22
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/__init__.py
Normal file
@ -0,0 +1,22 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:55
|
||||
# @Author : zhoujun
|
||||
import copy
|
||||
from .model import Model
|
||||
from .losses import build_loss
|
||||
|
||||
__all__ = ["build_loss", "build_model"]
|
||||
support_model = ["Model"]
|
||||
|
||||
|
||||
def build_model(config):
|
||||
"""
|
||||
get architecture model class
|
||||
"""
|
||||
copy_config = copy.deepcopy(config)
|
||||
arch_type = copy_config.pop("type")
|
||||
assert (
|
||||
arch_type in support_model
|
||||
), f"{arch_type} is not developed yet!, only {support_model} are support now"
|
||||
arch_model = eval(arch_type)(copy_config)
|
||||
return arch_model
|
||||
@ -0,0 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:54
|
||||
# @Author : zhoujun
|
||||
|
||||
from .resnet import *
|
||||
|
||||
__all__ = ["build_backbone"]
|
||||
|
||||
support_backbone = [
|
||||
"resnet18",
|
||||
"deformable_resnet18",
|
||||
"deformable_resnet50",
|
||||
"resnet50",
|
||||
"resnet34",
|
||||
"resnet101",
|
||||
"resnet152",
|
||||
]
|
||||
|
||||
|
||||
def build_backbone(backbone_name, **kwargs):
|
||||
assert (
|
||||
backbone_name in support_backbone
|
||||
), f"all support backbone is {support_backbone}"
|
||||
backbone = eval(backbone_name)(**kwargs)
|
||||
return backbone
|
||||
@ -0,0 +1,366 @@
|
||||
import math
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
BatchNorm2d = nn.BatchNorm2D
|
||||
|
||||
__all__ = [
|
||||
"ResNet",
|
||||
"resnet18",
|
||||
"resnet34",
|
||||
"resnet50",
|
||||
"resnet101",
|
||||
"deformable_resnet18",
|
||||
"deformable_resnet50",
|
||||
"resnet152",
|
||||
]
|
||||
|
||||
model_urls = {
|
||||
"resnet18": "https://download.pytorch.org/models/resnet18-5c106cde.pth",
|
||||
"resnet34": "https://download.pytorch.org/models/resnet34-333f7ec4.pth",
|
||||
"resnet50": "https://download.pytorch.org/models/resnet50-19c8e357.pth",
|
||||
"resnet101": "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth",
|
||||
"resnet152": "https://download.pytorch.org/models/resnet152-b121ed2d.pth",
|
||||
}
|
||||
|
||||
|
||||
def constant_init(module, constant, bias=0):
|
||||
module.weight = paddle.create_parameter(
|
||||
shape=module.weight.shape,
|
||||
dtype="float32",
|
||||
default_initializer=paddle.nn.initializer.Constant(constant),
|
||||
)
|
||||
if hasattr(module, "bias"):
|
||||
module.bias = paddle.create_parameter(
|
||||
shape=module.bias.shape,
|
||||
dtype="float32",
|
||||
default_initializer=paddle.nn.initializer.Constant(bias),
|
||||
)
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
"""3x3 convolution with padding"""
|
||||
return nn.Conv2D(
|
||||
in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
|
||||
)
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.with_dcn = dcn is not None
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = BatchNorm2d(planes, momentum=0.1)
|
||||
self.relu = nn.ReLU()
|
||||
self.with_modulated_dcn = False
|
||||
if not self.with_dcn:
|
||||
self.conv2 = nn.Conv2D(
|
||||
planes, planes, kernel_size=3, padding=1, bias_attr=False
|
||||
)
|
||||
else:
|
||||
from paddle.vision.ops import DeformConv2D
|
||||
|
||||
deformable_groups = dcn.get("deformable_groups", 1)
|
||||
offset_channels = 18
|
||||
self.conv2_offset = nn.Conv2D(
|
||||
planes, deformable_groups * offset_channels, kernel_size=3, padding=1
|
||||
)
|
||||
self.conv2 = DeformConv2D(
|
||||
planes, planes, kernel_size=3, padding=1, bias_attr=False
|
||||
)
|
||||
self.bn2 = BatchNorm2d(planes, momentum=0.1)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
# out = self.conv2(out)
|
||||
if not self.with_dcn:
|
||||
out = self.conv2(out)
|
||||
else:
|
||||
offset = self.conv2_offset(out)
|
||||
out = self.conv2(out, offset)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Bottleneck(nn.Layer):
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
self.with_dcn = dcn is not None
|
||||
self.conv1 = nn.Conv2D(inplanes, planes, kernel_size=1, bias_attr=False)
|
||||
self.bn1 = BatchNorm2d(planes, momentum=0.1)
|
||||
self.with_modulated_dcn = False
|
||||
if not self.with_dcn:
|
||||
self.conv2 = nn.Conv2D(
|
||||
planes, planes, kernel_size=3, stride=stride, padding=1, bias_attr=False
|
||||
)
|
||||
else:
|
||||
deformable_groups = dcn.get("deformable_groups", 1)
|
||||
from paddle.vision.ops import DeformConv2D
|
||||
|
||||
offset_channels = 18
|
||||
self.conv2_offset = nn.Conv2D(
|
||||
planes,
|
||||
deformable_groups * offset_channels,
|
||||
stride=stride,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
)
|
||||
self.conv2 = DeformConv2D(
|
||||
planes, planes, kernel_size=3, padding=1, stride=stride, bias_attr=False
|
||||
)
|
||||
self.bn2 = BatchNorm2d(planes, momentum=0.1)
|
||||
self.conv3 = nn.Conv2D(planes, planes * 4, kernel_size=1, bias_attr=False)
|
||||
self.bn3 = BatchNorm2d(planes * 4, momentum=0.1)
|
||||
self.relu = nn.ReLU()
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
self.dcn = dcn
|
||||
self.with_dcn = dcn is not None
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
# out = self.conv2(out)
|
||||
if not self.with_dcn:
|
||||
out = self.conv2(out)
|
||||
else:
|
||||
offset = self.conv2_offset(out)
|
||||
out = self.conv2(out, offset)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Layer):
|
||||
def __init__(self, block, layers, in_channels=3, dcn=None):
|
||||
self.dcn = dcn
|
||||
self.inplanes = 64
|
||||
super(ResNet, self).__init__()
|
||||
self.out_channels = []
|
||||
self.conv1 = nn.Conv2D(
|
||||
in_channels, 64, kernel_size=7, stride=2, padding=3, bias_attr=False
|
||||
)
|
||||
self.bn1 = BatchNorm2d(64, momentum=0.1)
|
||||
self.relu = nn.ReLU()
|
||||
self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dcn=dcn)
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dcn=dcn)
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dcn=dcn)
|
||||
|
||||
if self.dcn is not None:
|
||||
for m in self.modules():
|
||||
if isinstance(m, Bottleneck) or isinstance(m, BasicBlock):
|
||||
if hasattr(m, "conv2_offset"):
|
||||
constant_init(m.conv2_offset, 0)
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1, dcn=None):
|
||||
downsample = None
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
nn.Conv2D(
|
||||
self.inplanes,
|
||||
planes * block.expansion,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias_attr=False,
|
||||
),
|
||||
BatchNorm2d(planes * block.expansion, momentum=0.1),
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample, dcn=dcn))
|
||||
self.inplanes = planes * block.expansion
|
||||
for i in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes, dcn=dcn))
|
||||
self.out_channels.append(planes * block.expansion)
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv1(x)
|
||||
x = self.bn1(x)
|
||||
x = self.relu(x)
|
||||
x = self.maxpool(x)
|
||||
|
||||
x2 = self.layer1(x)
|
||||
x3 = self.layer2(x2)
|
||||
x4 = self.layer3(x3)
|
||||
x5 = self.layer4(x4)
|
||||
|
||||
return x2, x3, x4, x5
|
||||
|
||||
|
||||
def load_torch_params(paddle_model, torch_patams):
|
||||
paddle_params = paddle_model.state_dict()
|
||||
|
||||
fc_names = ["classifier"]
|
||||
for key, torch_value in torch_patams.items():
|
||||
if "num_batches_tracked" in key:
|
||||
continue
|
||||
key = (
|
||||
key.replace("running_var", "_variance")
|
||||
.replace("running_mean", "_mean")
|
||||
.replace("module.", "")
|
||||
)
|
||||
torch_value = torch_value.detach().cpu().numpy()
|
||||
if key in paddle_params:
|
||||
flag = [i in key for i in fc_names]
|
||||
if any(flag) and "weight" in key: # ignore bias
|
||||
new_shape = [1, 0] + list(range(2, torch_value.ndim))
|
||||
print(
|
||||
f"name: {key}, ori shape: {torch_value.shape}, new shape: {torch_value.transpose(new_shape).shape}"
|
||||
)
|
||||
torch_value = torch_value.transpose(new_shape)
|
||||
paddle_params[key] = torch_value
|
||||
else:
|
||||
print(f"{key} not in paddle")
|
||||
paddle_model.set_state_dict(paddle_params)
|
||||
|
||||
|
||||
def load_models(model, model_name):
|
||||
import torch.utils.model_zoo as model_zoo
|
||||
|
||||
torch_patams = model_zoo.load_url(model_urls[model_name])
|
||||
load_torch_params(model, torch_patams)
|
||||
|
||||
|
||||
def resnet18(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-18 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
print("load from imagenet")
|
||||
load_models(model, "resnet18")
|
||||
return model
|
||||
|
||||
|
||||
def deformable_resnet18(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-18 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(BasicBlock, [2, 2, 2, 2], dcn=dict(deformable_groups=1), **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
print("load from imagenet")
|
||||
model.load_state_dict(model_zoo.load_url(model_urls["resnet18"]), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
def resnet34(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-34 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
model.load_state_dict(model_zoo.load_url(model_urls["resnet34"]), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
def resnet50(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-50 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
load_models(model, "resnet50")
|
||||
return model
|
||||
|
||||
|
||||
def deformable_resnet50(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-50 model with deformable conv.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 6, 3], dcn=dict(deformable_groups=1), **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
model.load_state_dict(model_zoo.load_url(model_urls["resnet50"]), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
def resnet101(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-101 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
model.load_state_dict(model_zoo.load_url(model_urls["resnet101"]), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
def resnet152(pretrained=True, **kwargs):
|
||||
"""Constructs a ResNet-152 model.
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
|
||||
if pretrained:
|
||||
assert (
|
||||
kwargs.get("in_channels", 3) == 3
|
||||
), "in_channels must be 3 when pretrained is True"
|
||||
model.load_state_dict(model_zoo.load_url(model_urls["resnet152"]), strict=False)
|
||||
return model
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
x = paddle.zeros([2, 3, 640, 640])
|
||||
net = resnet50(pretrained=True)
|
||||
y = net(x)
|
||||
for u in y:
|
||||
print(u.shape)
|
||||
|
||||
print(net.out_channels)
|
||||
40
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/basic.py
Normal file
40
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/basic.py
Normal file
@ -0,0 +1,40 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/6 11:19
|
||||
# @Author : zhoujun
|
||||
from paddle import nn
|
||||
|
||||
|
||||
class ConvBnRelu(nn.Layer):
|
||||
def __init__(
|
||||
self,
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size,
|
||||
stride=1,
|
||||
padding=0,
|
||||
dilation=1,
|
||||
groups=1,
|
||||
bias=True,
|
||||
padding_mode="zeros",
|
||||
inplace=True,
|
||||
):
|
||||
super().__init__()
|
||||
self.conv = nn.Conv2D(
|
||||
in_channels=in_channels,
|
||||
out_channels=out_channels,
|
||||
kernel_size=kernel_size,
|
||||
stride=stride,
|
||||
padding=padding,
|
||||
dilation=dilation,
|
||||
groups=groups,
|
||||
bias_attr=bias,
|
||||
padding_mode=padding_mode,
|
||||
)
|
||||
self.bn = nn.BatchNorm2D(out_channels)
|
||||
self.relu = nn.ReLU()
|
||||
|
||||
def forward(self, x):
|
||||
x = self.conv(x)
|
||||
x = self.bn(x)
|
||||
x = self.relu(x)
|
||||
return x
|
||||
132
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/head/DBHead.py
Normal file
132
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/head/DBHead.py
Normal file
@ -0,0 +1,132 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/4 14:54
|
||||
# @Author : zhoujun
|
||||
import paddle
|
||||
from paddle import nn, ParamAttr
|
||||
|
||||
|
||||
class DBHead(nn.Layer):
|
||||
def __init__(self, in_channels, out_channels, k=50):
|
||||
super().__init__()
|
||||
self.k = k
|
||||
self.binarize = nn.Sequential(
|
||||
nn.Conv2D(
|
||||
in_channels,
|
||||
in_channels // 4,
|
||||
3,
|
||||
padding=1,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
|
||||
),
|
||||
nn.BatchNorm2D(
|
||||
in_channels // 4,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
|
||||
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
|
||||
),
|
||||
nn.ReLU(),
|
||||
nn.Conv2DTranspose(
|
||||
in_channels // 4,
|
||||
in_channels // 4,
|
||||
2,
|
||||
2,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
|
||||
),
|
||||
nn.BatchNorm2D(
|
||||
in_channels // 4,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
|
||||
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
|
||||
),
|
||||
nn.ReLU(),
|
||||
nn.Conv2DTranspose(
|
||||
in_channels // 4, 1, 2, 2, weight_attr=nn.initializer.KaimingNormal()
|
||||
),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
|
||||
self.thresh = self._init_thresh(in_channels)
|
||||
|
||||
def forward(self, x):
|
||||
shrink_maps = self.binarize(x)
|
||||
threshold_maps = self.thresh(x)
|
||||
if self.training:
|
||||
binary_maps = self.step_function(shrink_maps, threshold_maps)
|
||||
y = paddle.concat((shrink_maps, threshold_maps, binary_maps), axis=1)
|
||||
else:
|
||||
y = paddle.concat((shrink_maps, threshold_maps), axis=1)
|
||||
return y
|
||||
|
||||
def _init_thresh(self, inner_channels, serial=False, smooth=False, bias=False):
|
||||
in_channels = inner_channels
|
||||
if serial:
|
||||
in_channels += 1
|
||||
self.thresh = nn.Sequential(
|
||||
nn.Conv2D(
|
||||
in_channels,
|
||||
inner_channels // 4,
|
||||
3,
|
||||
padding=1,
|
||||
bias_attr=bias,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
|
||||
),
|
||||
nn.BatchNorm2D(
|
||||
inner_channels // 4,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
|
||||
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
|
||||
),
|
||||
nn.ReLU(),
|
||||
self._init_upsample(
|
||||
inner_channels // 4, inner_channels // 4, smooth=smooth, bias=bias
|
||||
),
|
||||
nn.BatchNorm2D(
|
||||
inner_channels // 4,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.Constant(1)),
|
||||
bias_attr=ParamAttr(initializer=nn.initializer.Constant(1e-4)),
|
||||
),
|
||||
nn.ReLU(),
|
||||
self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
|
||||
nn.Sigmoid(),
|
||||
)
|
||||
return self.thresh
|
||||
|
||||
def _init_upsample(self, in_channels, out_channels, smooth=False, bias=False):
|
||||
if smooth:
|
||||
inter_out_channels = out_channels
|
||||
if out_channels == 1:
|
||||
inter_out_channels = in_channels
|
||||
module_list = [
|
||||
nn.Upsample(scale_factor=2, mode="nearest"),
|
||||
nn.Conv2D(
|
||||
in_channels,
|
||||
inter_out_channels,
|
||||
3,
|
||||
1,
|
||||
1,
|
||||
bias_attr=bias,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
|
||||
),
|
||||
]
|
||||
if out_channels == 1:
|
||||
module_list.append(
|
||||
nn.Conv2D(
|
||||
in_channels,
|
||||
out_channels,
|
||||
kernel_size=1,
|
||||
stride=1,
|
||||
padding=1,
|
||||
bias_attr=True,
|
||||
weight_attr=ParamAttr(
|
||||
initializer=nn.initializer.KaimingNormal()
|
||||
),
|
||||
)
|
||||
)
|
||||
return nn.Sequential(module_list)
|
||||
else:
|
||||
return nn.Conv2DTranspose(
|
||||
in_channels,
|
||||
out_channels,
|
||||
2,
|
||||
2,
|
||||
weight_attr=ParamAttr(initializer=nn.initializer.KaimingNormal()),
|
||||
)
|
||||
|
||||
def step_function(self, x, y):
|
||||
return paddle.reciprocal(1 + paddle.exp(-self.k * (x - y)))
|
||||
@ -0,0 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2020/6/5 11:35
|
||||
# @Author : zhoujun
|
||||
from .DBHead import DBHead
|
||||
|
||||
__all__ = ["build_head"]
|
||||
support_head = ["DBHead"]
|
||||
|
||||
|
||||
def build_head(head_name, **kwargs):
|
||||
assert head_name in support_head, f"all support head is {support_head}"
|
||||
head = eval(head_name)(**kwargs)
|
||||
return head
|
||||
@ -0,0 +1,50 @@
|
||||
import paddle
|
||||
from models.losses.basic_loss import BalanceCrossEntropyLoss, MaskL1Loss, DiceLoss
|
||||
|
||||
|
||||
class DBLoss(paddle.nn.Layer):
|
||||
def __init__(self, alpha=1.0, beta=10, ohem_ratio=3, reduction="mean", eps=1e-06):
|
||||
"""
|
||||
Implement PSE Loss.
|
||||
:param alpha: binary_map loss 前面的系数
|
||||
:param beta: threshold_map loss 前面的系数
|
||||
:param ohem_ratio: OHEM的比例
|
||||
:param reduction: 'mean' or 'sum'对 batch里的loss 算均值或求和
|
||||
"""
|
||||
super().__init__()
|
||||
assert reduction in ["mean", "sum"], " reduction must in ['mean','sum']"
|
||||
self.alpha = alpha
|
||||
self.beta = beta
|
||||
self.bce_loss = BalanceCrossEntropyLoss(negative_ratio=ohem_ratio)
|
||||
self.dice_loss = DiceLoss(eps=eps)
|
||||
self.l1_loss = MaskL1Loss(eps=eps)
|
||||
self.ohem_ratio = ohem_ratio
|
||||
self.reduction = reduction
|
||||
|
||||
def forward(self, pred, batch):
|
||||
shrink_maps = pred[:, 0, :, :]
|
||||
threshold_maps = pred[:, 1, :, :]
|
||||
binary_maps = pred[:, 2, :, :]
|
||||
loss_shrink_maps = self.bce_loss(
|
||||
shrink_maps, batch["shrink_map"], batch["shrink_mask"]
|
||||
)
|
||||
loss_threshold_maps = self.l1_loss(
|
||||
threshold_maps, batch["threshold_map"], batch["threshold_mask"]
|
||||
)
|
||||
metrics = dict(
|
||||
loss_shrink_maps=loss_shrink_maps, loss_threshold_maps=loss_threshold_maps
|
||||
)
|
||||
if pred.shape[1] > 2:
|
||||
loss_binary_maps = self.dice_loss(
|
||||
binary_maps, batch["shrink_map"], batch["shrink_mask"]
|
||||
)
|
||||
metrics["loss_binary_maps"] = loss_binary_maps
|
||||
loss_all = (
|
||||
self.alpha * loss_shrink_maps
|
||||
+ self.beta * loss_threshold_maps
|
||||
+ loss_binary_maps
|
||||
)
|
||||
metrics["loss"] = loss_all
|
||||
else:
|
||||
metrics["loss"] = loss_shrink_maps
|
||||
return metrics
|
||||
@ -0,0 +1,16 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2020/6/5 11:36
|
||||
# @Author : zhoujun
|
||||
import copy
|
||||
from .DB_loss import DBLoss
|
||||
|
||||
__all__ = ["build_loss"]
|
||||
support_loss = ["DBLoss"]
|
||||
|
||||
|
||||
def build_loss(config):
|
||||
copy_config = copy.deepcopy(config)
|
||||
loss_type = copy_config.pop("type")
|
||||
assert loss_type in support_loss, f"all support loss is {support_loss}"
|
||||
criterion = eval(loss_type)(**copy_config)
|
||||
return criterion
|
||||
@ -0,0 +1,101 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/4 14:39
|
||||
# @Author : zhoujun
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
|
||||
class BalanceCrossEntropyLoss(nn.Layer):
|
||||
"""
|
||||
Balanced cross entropy loss.
|
||||
Shape:
|
||||
- Input: :math:`(N, 1, H, W)`
|
||||
- GT: :math:`(N, 1, H, W)`, same shape as the input
|
||||
- Mask: :math:`(N, H, W)`, same spatial shape as the input
|
||||
- Output: scalar.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, negative_ratio=3.0, eps=1e-6):
|
||||
super(BalanceCrossEntropyLoss, self).__init__()
|
||||
self.negative_ratio = negative_ratio
|
||||
self.eps = eps
|
||||
|
||||
def forward(
|
||||
self,
|
||||
pred: paddle.Tensor,
|
||||
gt: paddle.Tensor,
|
||||
mask: paddle.Tensor,
|
||||
return_origin=False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
pred: shape :math:`(N, 1, H, W)`, the prediction of network
|
||||
gt: shape :math:`(N, 1, H, W)`, the target
|
||||
mask: shape :math:`(N, H, W)`, the mask indicates positive regions
|
||||
"""
|
||||
positive = gt * mask
|
||||
negative = (1 - gt) * mask
|
||||
positive_count = int(positive.sum())
|
||||
negative_count = min(
|
||||
int(negative.sum()), int(positive_count * self.negative_ratio)
|
||||
)
|
||||
loss = nn.functional.binary_cross_entropy(pred, gt, reduction="none")
|
||||
positive_loss = loss * positive
|
||||
negative_loss = loss * negative
|
||||
negative_loss, _ = negative_loss.reshape([-1]).topk(negative_count)
|
||||
|
||||
balance_loss = (positive_loss.sum() + negative_loss.sum()) / (
|
||||
positive_count + negative_count + self.eps
|
||||
)
|
||||
|
||||
if return_origin:
|
||||
return balance_loss, loss
|
||||
return balance_loss
|
||||
|
||||
|
||||
class DiceLoss(nn.Layer):
|
||||
"""
|
||||
Loss function from https://arxiv.org/abs/1707.03237,
|
||||
where iou computation is introduced heatmap manner to measure the
|
||||
diversity between tow heatmaps.
|
||||
"""
|
||||
|
||||
def __init__(self, eps=1e-6):
|
||||
super(DiceLoss, self).__init__()
|
||||
self.eps = eps
|
||||
|
||||
def forward(self, pred: paddle.Tensor, gt, mask, weights=None):
|
||||
"""
|
||||
pred: one or two heatmaps of shape (N, 1, H, W),
|
||||
the losses of tow heatmaps are added together.
|
||||
gt: (N, 1, H, W)
|
||||
mask: (N, H, W)
|
||||
"""
|
||||
return self._compute(pred, gt, mask, weights)
|
||||
|
||||
def _compute(self, pred, gt, mask, weights):
|
||||
if len(pred.shape) == 4:
|
||||
pred = pred[:, 0, :, :]
|
||||
gt = gt[:, 0, :, :]
|
||||
assert pred.shape == gt.shape
|
||||
assert pred.shape == mask.shape
|
||||
if weights is not None:
|
||||
assert weights.shape == mask.shape
|
||||
mask = weights * mask
|
||||
intersection = (pred * gt * mask).sum()
|
||||
|
||||
union = (pred * mask).sum() + (gt * mask).sum() + self.eps
|
||||
loss = 1 - 2.0 * intersection / union
|
||||
assert loss <= 1
|
||||
return loss
|
||||
|
||||
|
||||
class MaskL1Loss(nn.Layer):
|
||||
def __init__(self, eps=1e-6):
|
||||
super(MaskL1Loss, self).__init__()
|
||||
self.eps = eps
|
||||
|
||||
def forward(self, pred: paddle.Tensor, gt, mask):
|
||||
loss = (paddle.abs(pred - gt) * mask).sum() / (mask.sum() + self.eps)
|
||||
return loss
|
||||
39
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/model.py
Normal file
39
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/model.py
Normal file
@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:57
|
||||
# @Author : zhoujun
|
||||
from addict import Dict
|
||||
from paddle import nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from models.backbone import build_backbone
|
||||
from models.neck import build_neck
|
||||
from models.head import build_head
|
||||
|
||||
|
||||
class Model(nn.Layer):
    # Composite detection network: backbone -> neck -> head, each built from
    # a config section by the corresponding project factory function.
    def __init__(self, model_config: dict):
        """
        PANnet
        :param model_config: model configuration dict with "backbone",
            "neck" and "head" sections; each section carries a "type" key
            naming the component plus its constructor kwargs.
        """
        super().__init__()
        # Dict (addict) allows attribute-style access to nested config keys.
        model_config = Dict(model_config)
        # pop("type") removes the selector so the remaining keys can be
        # forwarded verbatim as constructor kwargs.
        backbone_type = model_config.backbone.pop("type")
        neck_type = model_config.neck.pop("type")
        head_type = model_config.head.pop("type")
        self.backbone = build_backbone(backbone_type, **model_config.backbone)
        # Neck and head channel counts are chained from the previous stage.
        self.neck = build_neck(
            neck_type, in_channels=self.backbone.out_channels, **model_config.neck
        )
        self.head = build_head(
            head_type, in_channels=self.neck.out_channels, **model_config.head
        )
        self.name = f"{backbone_type}_{neck_type}_{head_type}"

    def forward(self, x):
        # x: input image batch; the visible indexing implies an NCHW layout.
        _, _, H, W = x.shape
        backbone_out = self.backbone(x)
        neck_out = self.neck(backbone_out)
        y = self.head(neck_out)
        # Upsample the head output back to the input spatial size so the
        # prediction aligns pixel-for-pixel with the image.
        y = F.interpolate(y, size=(H, W), mode="bilinear", align_corners=True)
        return y
75
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/neck/FPN.py
Normal file
75
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/models/neck/FPN.py
Normal file
@ -0,0 +1,75 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/9/13 10:29
|
||||
# @Author : zhoujun
|
||||
import paddle
|
||||
import paddle.nn.functional as F
|
||||
from paddle import nn
|
||||
|
||||
from models.basic import ConvBnRelu
|
||||
|
||||
|
||||
class FPN(nn.Layer):
    # Feature Pyramid Network neck: reduces the four backbone feature maps
    # to a common channel width, merges them top-down, and concatenates the
    # four pyramid levels into a single output map.
    def __init__(self, in_channels, inner_channels=256, **kwargs):
        """
        :param in_channels: output channel counts of the backbone's four
            feature stages (indexed [0..3] below, so a 4-element sequence).
        :param inner_channels: total output width; each pyramid level gets
            inner_channels // 4 channels so the final concat restores it.
        :param kwargs: ignored; accepted for config compatibility.
        """
        super().__init__()
        inplace = True
        self.conv_out = inner_channels
        # Per-level width: four levels are concatenated at the end.
        inner_channels = inner_channels // 4
        # reduce layers: 1x1 convs projecting each stage to the common width
        self.reduce_conv_c2 = ConvBnRelu(
            in_channels[0], inner_channels, kernel_size=1, inplace=inplace
        )
        self.reduce_conv_c3 = ConvBnRelu(
            in_channels[1], inner_channels, kernel_size=1, inplace=inplace
        )
        self.reduce_conv_c4 = ConvBnRelu(
            in_channels[2], inner_channels, kernel_size=1, inplace=inplace
        )
        self.reduce_conv_c5 = ConvBnRelu(
            in_channels[3], inner_channels, kernel_size=1, inplace=inplace
        )
        # Smooth layers: 3x3 convs applied after each top-down merge
        self.smooth_p4 = ConvBnRelu(
            inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
        )
        self.smooth_p3 = ConvBnRelu(
            inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
        )
        self.smooth_p2 = ConvBnRelu(
            inner_channels, inner_channels, kernel_size=3, padding=1, inplace=inplace
        )

        # Final fusion conv over the concatenated pyramid.
        self.conv = nn.Sequential(
            nn.Conv2D(self.conv_out, self.conv_out, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm2D(self.conv_out),
            nn.ReLU(),
        )
        self.out_channels = self.conv_out

    def forward(self, x):
        # x: tuple/list of the four backbone stage outputs, shallow to deep.
        c2, c3, c4, c5 = x
        # Top-down: upsample the deeper level and add the reduced lateral.
        p5 = self.reduce_conv_c5(c5)
        p4 = self._upsample_add(p5, self.reduce_conv_c4(c4))
        p4 = self.smooth_p4(p4)
        p3 = self._upsample_add(p4, self.reduce_conv_c3(c3))
        p3 = self.smooth_p3(p3)
        p2 = self._upsample_add(p3, self.reduce_conv_c2(c2))
        p2 = self.smooth_p2(p2)

        # Bring all levels to p2's resolution and concatenate on channels.
        x = self._upsample_cat(p2, p3, p4, p5)
        x = self.conv(x)
        return x

    def _upsample_add(self, x, y):
        # Resize x to y's spatial size (default nearest interpolation), then
        # element-wise add.
        return F.interpolate(x, size=y.shape[2:]) + y

    def _upsample_cat(self, p2, p3, p4, p5):
        # Resize p3..p5 to p2's (h, w) and concatenate along channel axis 1.
        h, w = p2.shape[2:]
        p3 = F.interpolate(p3, size=(h, w))
        p4 = F.interpolate(p4, size=(h, w))
        p5 = F.interpolate(p5, size=(h, w))
        return paddle.concat([p2, p3, p4, p5], axis=1)
@ -0,0 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
# @Time : 2020/6/5 11:34
# @Author : zhoujun
from .FPN import FPN

__all__ = ["build_neck"]
support_neck = ["FPN"]

# Explicit name -> class registry. Looking the class up here instead of
# eval()-ing the incoming string avoids executing arbitrary expressions
# that might arrive via a config file.
_NECK_REGISTRY = {"FPN": FPN}


def build_neck(neck_name, **kwargs):
    """Instantiate a neck module by name.

    :param neck_name: name of the neck; must be listed in ``support_neck``
        (currently only "FPN").
    :param kwargs: forwarded verbatim to the neck class constructor.
    :return: the constructed neck layer.
    :raises AssertionError: if ``neck_name`` is not supported (same failure
        mode as before, so existing callers are unaffected).
    """
    assert neck_name in support_neck, f"all support neck is {support_neck}"
    neck = _NECK_REGISTRY[neck_name](**kwargs)
    return neck
@ -0,0 +1,2 @@
|
||||
# export NCCL_P2P_DISABLE=1
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m paddle.distributed.launch tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml"
|
||||
@ -0,0 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/5 15:17
|
||||
# @Author : zhoujun
|
||||
|
||||
from .seg_detector_representer import SegDetectorRepresenter
|
||||
|
||||
|
||||
def get_post_processing(config):
    """Build the post-processing object described by ``config``.

    :param config: dict with keys ``type`` (class name to instantiate,
        e.g. "SegDetectorRepresenter") and ``args`` (constructor kwargs).
    :return: the constructed post-processor, or None when construction
        fails (missing keys, unknown type, bad args).
    """
    try:
        cls = eval(config["type"])(**config["args"])
        return cls
    except Exception:
        # Deliberate best-effort: an invalid/missing config disables
        # post-processing instead of aborting. Catching Exception (not a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        return None
@ -0,0 +1,191 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
import pyclipper
|
||||
import paddle
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class SegDetectorRepresenter:
    # Converts DB segmentation probability maps into text boxes/polygons:
    # binarize the map, trace contours, score them against the probability
    # map, expand (unclip) surviving contours, and rescale to image size.
    def __init__(
        self, thresh=0.3, box_thresh=0.7, max_candidates=1000, unclip_ratio=1.5
    ):
        # thresh: binarization threshold on the probability map.
        # box_thresh: minimum mean-probability score for a kept box.
        # max_candidates: cap on contours examined per image.
        # unclip_ratio: expansion factor used by unclip().
        self.min_size = 3  # minimum short-side length of a kept box (pixels)
        self.thresh = thresh
        self.box_thresh = box_thresh
        self.max_candidates = max_candidates
        self.unclip_ratio = unclip_ratio

    def __call__(self, batch, pred, is_output_polygon=False):
        """Decode a batch of predictions into boxes (or polygons) + scores.

        batch: a dict produced by dataloaders; only batch["shape"] (the
            original (height, width) of each image) is read here. Other
            documented keys, as produced upstream:
            image: tensor of shape (N, C, H, W).
            polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
            ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
            filename: the original filenames of images.
        pred:
            binary: text region segmentation map, with shape (N, H, W)
            thresh: [if exists] threshold prediction with shape (N, H, W)
            thresh_binary: [if exists] binarized with threshold, (N, H, W)
        Returns (boxes_batch, scores_batch), one entry per image.
        """
        if isinstance(pred, paddle.Tensor):
            pred = pred.numpy()
        # Keep only the first channel (the binary/probability map).
        pred = pred[:, 0, :, :]
        segmentation = self.binarize(pred)
        boxes_batch = []
        scores_batch = []
        for batch_index in range(pred.shape[0]):
            # Original image size, used to rescale boxes from map coords.
            height, width = batch["shape"][batch_index]
            if is_output_polygon:
                boxes, scores = self.polygons_from_bitmap(
                    pred[batch_index], segmentation[batch_index], width, height
                )
            else:
                boxes, scores = self.boxes_from_bitmap(
                    pred[batch_index], segmentation[batch_index], width, height
                )
            boxes_batch.append(boxes)
            scores_batch.append(scores)
        return boxes_batch, scores_batch

    def binarize(self, pred):
        # Boolean map: True where probability exceeds the threshold.
        return pred > self.thresh

    def polygons_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """Extract free-form polygons from one binarized map.

        _bitmap: single map with shape (H, W),
            whose values are binarized as {0, 1}
        Returns (boxes, scores) as Python lists.
        """

        assert len(_bitmap.shape) == 2
        bitmap = _bitmap  # The first channel
        height, width = bitmap.shape
        boxes = []
        scores = []

        contours, _ = cv2.findContours(
            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )

        for contour in contours[: self.max_candidates]:
            # Simplify the contour; epsilon scales with its perimeter.
            epsilon = 0.005 * cv2.arcLength(contour, True)
            approx = cv2.approxPolyDP(contour, epsilon, True)
            points = approx.reshape((-1, 2))
            if points.shape[0] < 4:
                continue
            # _, sside = self.get_mini_boxes(contour)
            # if sside < self.min_size:
            #     continue
            # Score = mean probability inside the original contour.
            score = self.box_score_fast(pred, contour.squeeze(1))
            if self.box_thresh > score:
                continue

            if points.shape[0] > 2:
                box = self.unclip(points, unclip_ratio=self.unclip_ratio)
                # unclip may split the polygon into several pieces; skip those.
                if len(box) > 1:
                    continue
            else:
                continue
            box = box.reshape(-1, 2)
            # Reject polygons whose minimum bounding box is too small.
            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
            if sside < self.min_size + 2:
                continue

            # dest_width/height may be 0-d tensors; unwrap to plain ints.
            if not isinstance(dest_width, int):
                dest_width = dest_width.item()
                dest_height = dest_height.item()

            # Rescale from map coordinates to original image coordinates.
            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )
            boxes.append(box)
            scores.append(score)
        return boxes, scores

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        """Extract 4-point rotated rectangles from one binarized map.

        _bitmap: single map with shape (H, W),
            whose values are binarized as {0, 1}
        Returns (boxes, scores) as fixed-size numpy arrays; filtered-out
        candidates are left as all-zero rows.
        """

        assert len(_bitmap.shape) == 2
        bitmap = _bitmap  # The first channel
        height, width = bitmap.shape
        contours, _ = cv2.findContours(
            (bitmap * 255).astype(np.uint8), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE
        )
        num_contours = min(len(contours), self.max_candidates)
        boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
        scores = np.zeros((num_contours,), dtype=np.float32)

        for index in range(num_contours):
            contour = contours[index].squeeze(1)
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            score = self.box_score_fast(pred, contour)
            if self.box_thresh > score:
                continue

            # Expand the box, then take the min-area rectangle of the result.
            box = self.unclip(points, unclip_ratio=self.unclip_ratio).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)
            # dest_width/height may be 0-d tensors; unwrap to plain ints.
            if not isinstance(dest_width, int):
                dest_width = dest_width.item()
                dest_height = dest_height.item()

            # Rescale from map coordinates to original image coordinates.
            box[:, 0] = np.clip(np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height
            )
            boxes[index, :, :] = box.astype(np.int16)
            scores[index] = score
        return boxes, scores

    def unclip(self, box, unclip_ratio=1.5):
        # Offset the polygon outward by distance = area * ratio / perimeter
        # (the DB expansion formula) using pyclipper.
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        # Minimum-area rotated rectangle of the contour, with its 4 corners
        # reordered to a consistent (clockwise from top-left-ish) order.
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
        # Within each x-sorted pair, put the point with smaller y first.
        if points[1][1] > points[0][1]:
            index_1 = 0
            index_4 = 1
        else:
            index_1 = 1
            index_4 = 0
        if points[3][1] > points[2][1]:
            index_2 = 2
            index_3 = 3
        else:
            index_2 = 3
            index_3 = 2

        box = [points[index_1], points[index_2], points[index_3], points[index_4]]
        # min(bounding_box[1]) is the rectangle's short side length.
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        # Mean of the probability map inside the polygon, computed only on
        # the polygon's bounding-box crop for speed.
        h, w = bitmap.shape[:2]
        box = _box.copy()
        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        # Shift polygon coordinates into the crop's local frame.
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin : ymax + 1, xmin : xmax + 1], mask)[0]
1
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/predict.sh
Normal file
1
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/predict.sh
Normal file
@ -0,0 +1 @@
|
||||
CUDA_VISIBLE_DEVICES=0 python tools/predict.py --model_path model_best.pth --input_folder ./input --output_folder ./output --thre 0.7 --polygon --show --save_result
|
||||
14
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/requirement.txt
Normal file
14
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/requirement.txt
Normal file
@ -0,0 +1,14 @@
|
||||
anyconfig
|
||||
future
|
||||
imgaug
|
||||
matplotlib
|
||||
numpy
|
||||
opencv-python
|
||||
Polygon3
|
||||
pyclipper
|
||||
PyYAML
|
||||
scikit-image
|
||||
Shapely
|
||||
tqdm
|
||||
addict
|
||||
visualdl
|
||||
@ -0,0 +1 @@
|
||||
CUDA_VISIBLE_DEVICES=0 python3 tools/train.py --config_file "config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml"
|
||||
8
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/test/README.MD
Normal file
8
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/test/README.MD
Normal file
@ -0,0 +1,8 @@
|
||||
Place the images that you want to detect here. You had better name them as follows:
|
||||
img_10.jpg
|
||||
img_11.jpg
|
||||
img_{img_id}.jpg
|
||||
|
||||
For predicting single images, you can change the `img_path` in the `/tools/predict.py` to your image number.
|
||||
|
||||
The result will be saved in the output_folder (default: test/output) that you specify in predict.sh.
|
||||
@ -0,0 +1,287 @@
|
||||
#!/bin/bash
|
||||
source test_tipc/common_func.sh
|
||||
|
||||
# run benchmark sh
|
||||
# Usage:
|
||||
# bash run_benchmark_train.sh config.txt params
|
||||
# or
|
||||
# bash run_benchmark_train.sh config.txt
|
||||
|
||||
# Extract the value from a "key=value" string (echoes the part after '=').
# NOTE: deliberately mutates the global IFS; callers in this script rely on
# resetting IFS themselves.
function func_parser_params(){
    strs=$1
    IFS="="
    array=(${strs})
    tmp=${array[1]}
    echo ${tmp}
}
||||
|
||||
# Scale an epoch count by the device topology encoded in a string such as
# "N1C8" (see the example invocations below): M = digits after 'N',
# P = digits after 'C'; echoes num * M * P.
# NOTE: deliberately mutates the global IFS.
function set_dynamic_epoch(){
    string=$1
    num=$2
    # Drop the leading 'N' and split the remainder on 'C'.
    _str=${string:1:6}
    IFS="C"
    arr=(${_str})
    M=${arr[0]}
    P=${arr[1]}
    ep=`expr $num \* $M \* $P`
    echo $ep
}
||||
|
||||
# Replace the value on line $2 of config file $1 with $3, keeping the "key:"
# prefix of that line. Fixes the previous version, which ran sed against the
# literal string '$(unknown)' instead of the target file, so the config was
# never modified (and a nonexistent `unknown` command was invoked).
# NOTE: deliberately mutates the global IFS, like its sibling helpers.
function func_sed_params(){
    filename=$1
    line=$2
    param_value=$3
    # Read the current "key:value" line so we can preserve the key.
    params=`sed -n "${line}p" $filename`
    IFS=":"
    array=(${params})
    key=${array[0]}
    value=${array[1]}

    new_params="${key}:${param_value}"
    IFS=";"
    # In-place edit of the addressed line in the actual config file.
    cmd="sed -i '${line}s/.*/${new_params}/' '${filename}'"
    eval $cmd
}
||||
|
||||
# Build a comma-separated GPU id list from a topology string like "N1C4":
# M = digits after 'N', P = digits after 'C'; echoes "0,1,...,(P-1)/M".
# e.g. N1C4 -> "0,1,2,3". NOTE: deliberately mutates the global IFS.
function set_gpu_id(){
    string=$1
    # Drop the leading 'N' and split the remainder on 'C'.
    _str=${string:1:6}
    IFS="C"
    arr=(${_str})
    M=${arr[0]}
    P=${arr[1]}
    gn=`expr $P - 1`
    gpu_num=`expr $gn / $M`
    seq=`seq -s "," 0 $gpu_num`
    echo $seq
}
||||
|
||||
# Echo the basename of the current working directory (the repo name),
# obtained by splitting $PWD on '/'. NOTE: deliberately mutates the
# global IFS.
function get_repo_name(){
    IFS=";"
    cur_dir=$(pwd)
    IFS="/"
    arr=(${cur_dir})
    echo ${arr[-1]}
}
||||
|
||||
FILENAME=$1
|
||||
# copy FILENAME as new
|
||||
new_filename="./test_tipc/benchmark_train.txt"
|
||||
cmd=`yes|cp $FILENAME $new_filename`
|
||||
FILENAME=$new_filename
|
||||
# MODE must be one of ['benchmark_train']
|
||||
MODE=$2
|
||||
PARAMS=$3
|
||||
|
||||
to_static=""
|
||||
# parse "to_static" options and modify trainer into "to_static_trainer"
|
||||
if [[ $PARAMS =~ "dynamicTostatic" ]] ;then
|
||||
to_static="d2sT_"
|
||||
sed -i 's/trainer:norm_train/trainer:to_static_train/g' $FILENAME
|
||||
# clear PARAM contents
|
||||
if [ $PARAMS = "to_static" ] ;then
|
||||
PARAMS=""
|
||||
fi
|
||||
fi
|
||||
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_fp32_DP_N1C8
|
||||
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamicTostatic_bs8_fp32_DP_N1C8
|
||||
# bash test_tipc/benchmark_train.sh test_tipc/configs/det_mv3_db_v2_0/train_benchmark.txt benchmark_train dynamic_bs8_null_DP_N1C1
|
||||
IFS=$'\n'
|
||||
# parser params from train_benchmark.txt
|
||||
dataline=`cat $FILENAME`
|
||||
# parser params
|
||||
IFS=$'\n'
|
||||
lines=(${dataline})
|
||||
model_name=$(func_parser_value "${lines[1]}")
|
||||
python_name=$(func_parser_value "${lines[2]}")
|
||||
|
||||
# set env
|
||||
python=${python_name}
|
||||
export str_tmp=$(echo `pip list|grep paddlepaddle-gpu|awk -F ' ' '{print $2}'`)
|
||||
export frame_version=${str_tmp%%.post*}
|
||||
export frame_commit=$(echo `${python} -c "import paddle;print(paddle.version.commit)"`)
|
||||
|
||||
# 获取benchmark_params所在的行数
|
||||
line_num=`grep -n -w "train_benchmark_params" $FILENAME | cut -d ":" -f 1`
|
||||
# for train log parser
|
||||
batch_size=$(func_parser_value "${lines[line_num]}")
|
||||
line_num=`expr $line_num + 1`
|
||||
fp_items=$(func_parser_value "${lines[line_num]}")
|
||||
line_num=`expr $line_num + 1`
|
||||
epoch=$(func_parser_value "${lines[line_num]}")
|
||||
|
||||
line_num=`expr $line_num + 1`
|
||||
profile_option_key=$(func_parser_key "${lines[line_num]}")
|
||||
profile_option_params=$(func_parser_value "${lines[line_num]}")
|
||||
profile_option="${profile_option_key}:${profile_option_params}"
|
||||
|
||||
line_num=`expr $line_num + 1`
|
||||
flags_value=$(func_parser_value "${lines[line_num]}")
|
||||
# set flags
|
||||
IFS=";"
|
||||
flags_list=(${flags_value})
|
||||
for _flag in ${flags_list[*]}; do
|
||||
cmd="export ${_flag}"
|
||||
eval $cmd
|
||||
done
|
||||
|
||||
# set log_name
|
||||
repo_name=$(get_repo_name )
|
||||
SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log
|
||||
mkdir -p "${SAVE_LOG}/benchmark_log/"
|
||||
status_log="${SAVE_LOG}/benchmark_log/results.log"
|
||||
|
||||
# The number of lines in which train params can be replaced.
|
||||
line_python=3
|
||||
line_gpuid=4
|
||||
line_precision=6
|
||||
line_epoch=7
|
||||
line_batchsize=9
|
||||
line_profile=13
|
||||
line_eval_py=24
|
||||
line_export_py=30
|
||||
|
||||
func_sed_params "$FILENAME" "${line_eval_py}" "null"
|
||||
func_sed_params "$FILENAME" "${line_export_py}" "null"
|
||||
func_sed_params "$FILENAME" "${line_python}" "$python"
|
||||
|
||||
# if params
|
||||
if [ ! -n "$PARAMS" ] ;then
|
||||
# PARAMS input is not a word.
|
||||
IFS="|"
|
||||
batch_size_list=(${batch_size})
|
||||
fp_items_list=(${fp_items})
|
||||
device_num_list=(N1C4)
|
||||
run_mode="DP"
|
||||
elif [[ ${PARAMS} = "dynamicTostatic" ]];then
|
||||
IFS="|"
|
||||
model_type=$PARAMS
|
||||
batch_size_list=(${batch_size})
|
||||
fp_items_list=(${fp_items})
|
||||
device_num_list=(N1C4)
|
||||
run_mode="DP"
|
||||
else
|
||||
# parser params from input: modeltype_bs${bs_item}_${fp_item}_${run_mode}_${device_num}
|
||||
IFS="_"
|
||||
params_list=(${PARAMS})
|
||||
model_type=${params_list[0]}
|
||||
batch_size=${params_list[1]}
|
||||
batch_size=`echo ${batch_size} | tr -cd "[0-9]" `
|
||||
precision=${params_list[2]}
|
||||
run_mode=${params_list[3]}
|
||||
device_num=${params_list[4]}
|
||||
IFS=";"
|
||||
|
||||
if [ ${precision} = "fp16" ];then
|
||||
precision="amp"
|
||||
fi
|
||||
|
||||
epoch=$(set_dynamic_epoch $device_num $epoch)
|
||||
fp_items_list=($precision)
|
||||
batch_size_list=($batch_size)
|
||||
device_num_list=($device_num)
|
||||
fi
|
||||
|
||||
IFS="|"
|
||||
for batch_size in ${batch_size_list[*]}; do
|
||||
for train_precision in ${fp_items_list[*]}; do
|
||||
for device_num in ${device_num_list[*]}; do
|
||||
# sed batchsize and precision
|
||||
if [ ${train_precision} = "amp" ];then
|
||||
precision="fp16"
|
||||
else
|
||||
precision="fp32"
|
||||
fi
|
||||
|
||||
func_sed_params "$FILENAME" "${line_precision}" "$train_precision"
|
||||
func_sed_params "$FILENAME" "${line_batchsize}" "$MODE=$batch_size"
|
||||
func_sed_params "$FILENAME" "${line_epoch}" "$MODE=$epoch"
|
||||
gpu_id=$(set_gpu_id $device_num)
|
||||
|
||||
if [ ${#gpu_id} -le 1 ];then
|
||||
log_path="$SAVE_LOG/profiling_log"
|
||||
mkdir -p $log_path
|
||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling"
|
||||
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
|
||||
# set profile_option params
|
||||
tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
|
||||
|
||||
# run test_train_inference_python.sh
|
||||
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
||||
echo $cmd
|
||||
eval $cmd
|
||||
eval "cat ${log_path}/${log_name}"
|
||||
|
||||
# without profile
|
||||
log_path="$SAVE_LOG/train_log"
|
||||
speed_log_path="$SAVE_LOG/index"
|
||||
mkdir -p $log_path
|
||||
mkdir -p $speed_log_path
|
||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
|
||||
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
|
||||
func_sed_params "$FILENAME" "${line_profile}" "null" # sed profile_id as null
|
||||
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
||||
echo $cmd
|
||||
job_bt=`date '+%Y%m%d%H%M%S'`
|
||||
eval $cmd
|
||||
job_et=`date '+%Y%m%d%H%M%S'`
|
||||
export model_run_time=$((${job_et}-${job_bt}))
|
||||
eval "cat ${log_path}/${log_name}"
|
||||
|
||||
# parser log
|
||||
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
|
||||
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
||||
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
||||
--model_name ${_model_name} \
|
||||
--base_batch_size ${batch_size} \
|
||||
--run_mode ${run_mode} \
|
||||
--fp_item ${precision} \
|
||||
--keyword ips: \
|
||||
--skip_steps 2 \
|
||||
--device_num ${device_num} \
|
||||
--speed_unit samples/s \
|
||||
--convergence_key loss: "
|
||||
echo $cmd
|
||||
eval $cmd
|
||||
last_status=${PIPESTATUS[0]}
|
||||
status_check $last_status "${cmd}" "${status_log}"
|
||||
else
|
||||
IFS=";"
|
||||
unset_env=`unset CUDA_VISIBLE_DEVICES`
|
||||
log_path="$SAVE_LOG/train_log"
|
||||
speed_log_path="$SAVE_LOG/index"
|
||||
mkdir -p $log_path
|
||||
mkdir -p $speed_log_path
|
||||
log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}log"
|
||||
speed_log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}speed"
|
||||
func_sed_params "$FILENAME" "${line_gpuid}" "$gpu_id" # sed used gpu_id
|
||||
func_sed_params "$FILENAME" "${line_profile}" "null" # sed --profile_option as null
|
||||
cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
|
||||
echo $cmd
|
||||
job_bt=`date '+%Y%m%d%H%M%S'`
|
||||
eval $cmd
|
||||
job_et=`date '+%Y%m%d%H%M%S'`
|
||||
export model_run_time=$((${job_et}-${job_bt}))
|
||||
eval "cat ${log_path}/${log_name}"
|
||||
# parser log
|
||||
_model_name="${model_name}_bs${batch_size}_${precision}_${run_mode}"
|
||||
|
||||
cmd="${python} ${BENCHMARK_ROOT}/scripts/analysis.py --filename ${log_path}/${log_name} \
|
||||
--speed_log_file '${speed_log_path}/${speed_log_name}' \
|
||||
--model_name ${_model_name} \
|
||||
--base_batch_size ${batch_size} \
|
||||
--run_mode ${run_mode} \
|
||||
--fp_item ${precision} \
|
||||
--keyword ips: \
|
||||
--skip_steps 2 \
|
||||
--device_num ${device_num} \
|
||||
--speed_unit images/s \
|
||||
--convergence_key loss: "
|
||||
echo $cmd
|
||||
eval $cmd
|
||||
last_status=${PIPESTATUS[0]}
|
||||
status_check $last_status "${cmd}" "${status_log}"
|
||||
fi
|
||||
done
|
||||
done
|
||||
done
|
||||
@ -0,0 +1,67 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Extract the key from a "key:value" config line (echoes the part before
# the first ':'). NOTE: deliberately mutates the global IFS.
function func_parser_key(){
    strs=$1
    IFS=":"
    array=(${strs})
    tmp=${array[0]}
    echo ${tmp}
}
||||
|
||||
# Extract the value from a "key:value" config line (echoes the field after
# the first ':'). NOTE: deliberately mutates the global IFS.
function func_parser_value(){
    strs=$1
    IFS=":"
    array=(${strs})
    tmp=${array[1]}
    echo ${tmp}
}
||||
|
||||
# Format a command-line option "key=value". Echoes a single space instead
# when the key is "null" or the value is "null"/blank/empty, so the caller
# can splice the result into a command unconditionally.
function func_set_params(){
    key=$1
    value=$2
    if [ ${key}x = "null"x ];then
        echo " "
    elif [[ ${value} = "null" ]] || [[ ${value} = " " ]] || [ ${#value} -le 0 ];then
        echo " "
    else
        echo "${key}=${value}"
    fi
}
||||
|
||||
# Parse a multi-mode config line "key:modeA=v1|modeB=v2" and echo the value
# whose mode matches $2 (MODE). If no mode matches, only the empty ${res}
# is echoed. The trailing `echo ${res}` after a match adds an empty line,
# which callers discard via command substitution (it strips trailing
# newlines). NOTE: deliberately mutates the global IFS.
function func_parser_params(){
    strs=$1
    MODE=$2
    IFS=":"
    array=(${strs})
    key=${array[0]}
    tmp=${array[1]}
    # Candidate entries are separated by '|', each one "mode=value".
    IFS="|"
    res=""
    for _params in ${tmp[*]}; do
        IFS="="
        array=(${_params})
        mode=${array[0]}
        value=${array[1]}
        if [[ ${mode} = ${MODE} ]]; then
            IFS="|"
            #echo $(func_set_params "${mode}" "${value}")
            echo $value
            break
        fi
        IFS="|"
    done
    echo ${res}
}
||||
|
||||
# Append a colored success/failure line for a finished command to the run
# log (and echo it to stdout via tee). Success is exit code 0.
# $1 exit code, $2 command string, $3 log file, $4 model name, $5 log path.
function status_check(){
    last_status=$1   # the exit code
    run_command=$2
    run_log=$3
    model_name=$4
    log_path=$5
    if [ $last_status -eq 0 ]; then
        echo -e "\033[33m Run successfully with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
    else
        echo -e "\033[33m Run failed with command - ${model_name} - ${run_command} - ${log_path} \033[0m" | tee -a ${run_log}
    fi
}
||||
@ -0,0 +1,61 @@
|
||||
===========================train_params===========================
|
||||
model_name:det_res50_db
|
||||
python:python
|
||||
gpu_list:0|0,1
|
||||
trainer.use_gpu:True|True
|
||||
amp:null
|
||||
trainer.epochs:lite_train_lite_infer=1|whole_train_whole_infer=300
|
||||
trainer.output_dir:./output/
|
||||
dataset.train.loader.batch_size:lite_train_lite_infer=8|whole_train_lite_infer=8
|
||||
trainer.finetune_checkpoint:null
|
||||
train_model_name:checkpoint/model_latest.pth
|
||||
train_infer_img_dir:imgs/paper/db.jpg
|
||||
null:null
|
||||
##
|
||||
trainer:norm_train
|
||||
norm_train:tools/train.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o trainer.log_iter=1 trainer.enable_eval=False dataset.train.loader.shuffle=false arch.backbone.pretrained=False
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_train:null
|
||||
null:null
|
||||
null:null
|
||||
##
|
||||
===========================eval_params===========================
|
||||
eval:null
|
||||
null:null
|
||||
##
|
||||
===========================infer_params===========================
|
||||
trainer.output_dir:./output/
|
||||
trainer.resume_checkpoint:
|
||||
norm_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o
|
||||
quant_export:null
|
||||
fpgm_export:null
|
||||
distill_export:null
|
||||
export1:null
|
||||
export2:null
|
||||
##
|
||||
train_model:./inference/det_r50_vd_db_v2.0_train/best_accuracy
|
||||
infer_export:tools/export_model.py --config_file config/icdar2015_resnet50_FPN_DBhead_polyLR.yaml -o
|
||||
infer_quant:False
|
||||
inference:tools/infer.py
|
||||
--use_gpu:True|False
|
||||
--enable_mkldnn:False
|
||||
--cpu_threads:6
|
||||
--batch_size:1
|
||||
--use_tensorrt:False
|
||||
--precision:fp32
|
||||
--model_dir:
|
||||
--img_path:imgs/paper/db.jpg
|
||||
--save_log_path:null
|
||||
--benchmark:True
|
||||
null:null
|
||||
===========================infer_benchmark_params==========================
|
||||
random_infer_input:[{float32,[3,640,640]}];[{float32,[3,960,960]}]
|
||||
===========================train_benchmark_params==========================
|
||||
batch_size:8
|
||||
fp_items:fp32|fp16
|
||||
epoch:2
|
||||
--profiler_options:batch_range=[10,20];state=GPU;tracer_option=Default;profile_path=model.profile
|
||||
flags:FLAGS_eager_delete_tensor_gb=0.0;FLAGS_fraction_of_gpu_memory_to_use=0.98;FLAGS_conv_workspace_size_limit=4096
|
||||
===========================to_static_train_benchmark_params===========================
|
||||
to_static_train:trainer.to_static=true
|
||||
@ -0,0 +1,54 @@
|
||||
#!/bin/bash
# TIPC data/environment preparation: installs Python requirements and, for
# the det_res50_db model, downloads the pretrained backbone weights and the
# benchmark dataset. Driven by a TIPC config file and a MODE string.
source test_tipc/common_func.sh

FILENAME=$1

# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer',
#                 'whole_infer', 'klquant_whole_infer',
#                 'cpp_infer', 'serving_infer']

MODE=$2

dataline=$(cat ${FILENAME})

# parser params: split the config file into lines addressed by index below.
IFS=$'\n'
lines=(${dataline})

# The training params
model_name=$(func_parser_value "${lines[1]}")

trainer_list=$(func_parser_value "${lines[14]}")

if [ ${MODE} = "lite_train_lite_infer" ];then
    python_name_list=$(func_parser_value "${lines[2]}")
    array=(${python_name_list})
    python_name=${array[0]}
    ${python_name} -m pip install -r requirement.txt
    if [[ ${model_name} =~ "det_res50_db" ]];then
        # Pretrained ResNet-50 backbone weights for the DBNet benchmark.
        wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth

        # Download the dataset and extract it
        rm -rf datasets
        wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar
        tar xf datasets.tar
    fi
elif [ ${MODE} = "benchmark_train" ];then
    python_name_list=$(func_parser_value "${lines[2]}")
    array=(${python_name_list})
    python_name=${array[0]}
    ${python_name} -m pip install -r requirement.txt
    if [[ ${model_name} =~ "det_res50_db" ]];then
        # Pretrained ResNet-50 backbone weights for the DBNet benchmark.
        wget -nc https://paddle-wheel.bj.bcebos.com/benchmark/resnet50-19c8e357.pth -O /root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth

        # Download the dataset and extract it
        rm -rf datasets
        wget -nc https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/benchmark_train/datasets.tar
        tar xf datasets.tar
        # expand gt.txt 2 times
        # cd ./train_data/icdar2015/text_localization
        # for i in `seq 2`;do cp train_icdar2015_label.txt dup$i.txt;done
        # cat dup* > train_icdar2015_label.txt && rm -rf dup*
    fi
fi
@ -0,0 +1,343 @@
|
||||
#!/bin/bash
|
||||
source test_tipc/common_func.sh
|
||||
|
||||
FILENAME=$1
|
||||
# MODE be one of ['lite_train_lite_infer' 'lite_train_whole_infer' 'whole_train_whole_infer', 'whole_infer']
|
||||
MODE=$2
|
||||
|
||||
dataline=$(awk 'NR>=1{print}' $FILENAME)
|
||||
|
||||
# parser params
|
||||
IFS=$'\n'
|
||||
lines=(${dataline})
|
||||
|
||||
# The training params
|
||||
model_name=$(func_parser_value "${lines[1]}")
|
||||
python=$(func_parser_value "${lines[2]}")
|
||||
gpu_list=$(func_parser_value "${lines[3]}")
|
||||
train_use_gpu_key=$(func_parser_key "${lines[4]}")
|
||||
train_use_gpu_value=$(func_parser_value "${lines[4]}")
|
||||
autocast_list=$(func_parser_value "${lines[5]}")
|
||||
autocast_key=$(func_parser_key "${lines[5]}")
|
||||
epoch_key=$(func_parser_key "${lines[6]}")
|
||||
epoch_num=$(func_parser_params "${lines[6]}" "${MODE}")
|
||||
save_model_key=$(func_parser_key "${lines[7]}")
|
||||
train_batch_key=$(func_parser_key "${lines[8]}")
|
||||
train_batch_value=$(func_parser_params "${lines[8]}" "${MODE}")
|
||||
pretrain_model_key=$(func_parser_key "${lines[9]}")
|
||||
pretrain_model_value=$(func_parser_value "${lines[9]}")
|
||||
train_model_name=$(func_parser_value "${lines[10]}")
|
||||
train_infer_img_dir=$(func_parser_value "${lines[11]}")
|
||||
train_param_key1=$(func_parser_key "${lines[12]}")
|
||||
train_param_value1=$(func_parser_value "${lines[12]}")
|
||||
|
||||
trainer_list=$(func_parser_value "${lines[14]}")
|
||||
trainer_norm=$(func_parser_key "${lines[15]}")
|
||||
norm_trainer=$(func_parser_value "${lines[15]}")
|
||||
pact_key=$(func_parser_key "${lines[16]}")
|
||||
pact_trainer=$(func_parser_value "${lines[16]}")
|
||||
fpgm_key=$(func_parser_key "${lines[17]}")
|
||||
fpgm_trainer=$(func_parser_value "${lines[17]}")
|
||||
distill_key=$(func_parser_key "${lines[18]}")
|
||||
distill_trainer=$(func_parser_value "${lines[18]}")
|
||||
trainer_key1=$(func_parser_key "${lines[19]}")
|
||||
trainer_value1=$(func_parser_value "${lines[19]}")
|
||||
trainer_key2=$(func_parser_key "${lines[20]}")
|
||||
trainer_value2=$(func_parser_value "${lines[20]}")
|
||||
|
||||
eval_py=$(func_parser_value "${lines[23]}")
|
||||
eval_key1=$(func_parser_key "${lines[24]}")
|
||||
eval_value1=$(func_parser_value "${lines[24]}")
|
||||
|
||||
save_infer_key=$(func_parser_key "${lines[27]}")
|
||||
export_weight=$(func_parser_key "${lines[28]}")
|
||||
norm_export=$(func_parser_value "${lines[29]}")
|
||||
pact_export=$(func_parser_value "${lines[30]}")
|
||||
fpgm_export=$(func_parser_value "${lines[31]}")
|
||||
distill_export=$(func_parser_value "${lines[32]}")
|
||||
export_key1=$(func_parser_key "${lines[33]}")
|
||||
export_value1=$(func_parser_value "${lines[33]}")
|
||||
export_key2=$(func_parser_key "${lines[34]}")
|
||||
export_value2=$(func_parser_value "${lines[34]}")
|
||||
inference_dir=$(func_parser_value "${lines[35]}")
|
||||
|
||||
# parser inference model
|
||||
infer_model_dir_list=$(func_parser_value "${lines[36]}")
|
||||
infer_export_list=$(func_parser_value "${lines[37]}")
|
||||
infer_is_quant=$(func_parser_value "${lines[38]}")
|
||||
# parser inference
|
||||
inference_py=$(func_parser_value "${lines[39]}")
|
||||
use_gpu_key=$(func_parser_key "${lines[40]}")
|
||||
use_gpu_list=$(func_parser_value "${lines[40]}")
|
||||
use_mkldnn_key=$(func_parser_key "${lines[41]}")
|
||||
use_mkldnn_list=$(func_parser_value "${lines[41]}")
|
||||
cpu_threads_key=$(func_parser_key "${lines[42]}")
|
||||
cpu_threads_list=$(func_parser_value "${lines[42]}")
|
||||
batch_size_key=$(func_parser_key "${lines[43]}")
|
||||
batch_size_list=$(func_parser_value "${lines[43]}")
|
||||
use_trt_key=$(func_parser_key "${lines[44]}")
|
||||
use_trt_list=$(func_parser_value "${lines[44]}")
|
||||
precision_key=$(func_parser_key "${lines[45]}")
|
||||
precision_list=$(func_parser_value "${lines[45]}")
|
||||
infer_model_key=$(func_parser_key "${lines[46]}")
|
||||
image_dir_key=$(func_parser_key "${lines[47]}")
|
||||
infer_img_dir=$(func_parser_value "${lines[47]}")
|
||||
save_log_key=$(func_parser_key "${lines[48]}")
|
||||
benchmark_key=$(func_parser_key "${lines[49]}")
|
||||
benchmark_value=$(func_parser_value "${lines[49]}")
|
||||
infer_key1=$(func_parser_key "${lines[50]}")
|
||||
infer_value1=$(func_parser_value "${lines[50]}")
|
||||
|
||||
LOG_PATH="./test_tipc/output/${model_name}/${MODE}"
|
||||
mkdir -p ${LOG_PATH}
|
||||
status_log="${LOG_PATH}/results_python.log"
|
||||
|
||||
line_num=`grep -n -w "to_static_train_benchmark_params" $FILENAME | cut -d ":" -f 1`
|
||||
to_static_key=$(func_parser_key "${lines[line_num]}")
|
||||
to_static_trainer=$(func_parser_value "${lines[line_num]}")
|
||||
|
||||
function func_inference(){
|
||||
IFS='|'
|
||||
_python=$1
|
||||
_script=$2
|
||||
_model_dir=$3
|
||||
_log_path=$4
|
||||
_img_dir=$5
|
||||
_flag_quant=$6
|
||||
_gpu=$7
|
||||
# inference
|
||||
for use_gpu in ${use_gpu_list[*]}; do
|
||||
if [ ${use_gpu} = "False" ] || [ ${use_gpu} = "cpu" ]; then
|
||||
for use_mkldnn in ${use_mkldnn_list[*]}; do
|
||||
# if [ ${use_mkldnn} = "False" ] && [ ${_flag_quant} = "True" ]; then
|
||||
# continue
|
||||
# fi
|
||||
for threads in ${cpu_threads_list[*]}; do
|
||||
for batch_size in ${batch_size_list[*]}; do
|
||||
for precision in ${precision_list[*]}; do
|
||||
if [ ${use_mkldnn} = "False" ] && [ ${precision} = "fp16" ]; then
|
||||
continue
|
||||
fi # skip when enable fp16 but disable mkldnn
|
||||
if [ ${_flag_quant} = "True" ] && [ ${precision} != "int8" ]; then
|
||||
continue
|
||||
fi # skip when quant model inference but precision is not int8
|
||||
set_precision=$(func_set_params "${precision_key}" "${precision}")
|
||||
|
||||
_save_log_path="${_log_path}/python_infer_cpu_gpus_${_gpu}_usemkldnn_${use_mkldnn}_threads_${threads}_precision_${precision}_batchsize_${batch_size}.log"
|
||||
set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
|
||||
set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
|
||||
set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
|
||||
set_mkldnn=$(func_set_params "${use_mkldnn_key}" "${use_mkldnn}")
|
||||
set_cpu_threads=$(func_set_params "${cpu_threads_key}" "${threads}")
|
||||
set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
|
||||
set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}")
|
||||
set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
|
||||
command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_mkldnn} ${set_cpu_threads} ${set_model_dir} ${set_batchsize} ${set_infer_params0} ${set_infer_data} ${set_benchmark} ${set_precision} ${set_infer_params1} > ${_save_log_path} 2>&1 "
|
||||
eval $command
|
||||
last_status=${PIPESTATUS[0]}
|
||||
eval "cat ${_save_log_path}"
|
||||
status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
|
||||
done
|
||||
done
|
||||
done
|
||||
done
|
||||
elif [ ${use_gpu} = "True" ] || [ ${use_gpu} = "gpu" ]; then
|
||||
for use_trt in ${use_trt_list[*]}; do
|
||||
for precision in ${precision_list[*]}; do
|
||||
if [[ ${_flag_quant} = "False" ]] && [[ ${precision} =~ "int8" ]]; then
|
||||
continue
|
||||
fi
|
||||
if [[ ${precision} =~ "fp16" || ${precision} =~ "int8" ]] && [ ${use_trt} = "False" ]; then
|
||||
continue
|
||||
fi
|
||||
if [[ ${use_trt} = "False" && ${precision} =~ "int8" ]] && [ ${_flag_quant} = "True" ]; then
|
||||
continue
|
||||
fi
|
||||
for batch_size in ${batch_size_list[*]}; do
|
||||
_save_log_path="${_log_path}/python_infer_gpu_gpus_${_gpu}_usetrt_${use_trt}_precision_${precision}_batchsize_${batch_size}.log"
|
||||
set_infer_data=$(func_set_params "${image_dir_key}" "${_img_dir}")
|
||||
set_benchmark=$(func_set_params "${benchmark_key}" "${benchmark_value}")
|
||||
set_batchsize=$(func_set_params "${batch_size_key}" "${batch_size}")
|
||||
set_tensorrt=$(func_set_params "${use_trt_key}" "${use_trt}")
|
||||
set_precision=$(func_set_params "${precision_key}" "${precision}")
|
||||
set_model_dir=$(func_set_params "${infer_model_key}" "${_model_dir}")
|
||||
set_infer_params0=$(func_set_params "${save_log_key}" "${save_log_value}")
|
||||
set_infer_params1=$(func_set_params "${infer_key1}" "${infer_value1}")
|
||||
command="${_python} ${_script} ${use_gpu_key}=${use_gpu} ${set_tensorrt} ${set_precision} ${set_model_dir} ${set_batchsize} ${set_infer_data} ${set_benchmark} ${set_infer_params1} ${set_infer_params0} > ${_save_log_path} 2>&1 "
|
||||
eval $command
|
||||
last_status=${PIPESTATUS[0]}
|
||||
eval "cat ${_save_log_path}"
|
||||
status_check $last_status "${command}" "${status_log}" "${model_name}" "${_save_log_path}"
|
||||
|
||||
done
|
||||
done
|
||||
done
|
||||
else
|
||||
echo "Does not support hardware other than CPU and GPU Currently!"
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
if [ ${MODE} = "whole_infer" ]; then
|
||||
GPUID=$3
|
||||
if [ ${#GPUID} -le 0 ];then
|
||||
env=" "
|
||||
else
|
||||
env="export CUDA_VISIBLE_DEVICES=${GPUID}"
|
||||
fi
|
||||
# set CUDA_VISIBLE_DEVICES
|
||||
eval $env
|
||||
export Count=0
|
||||
gpu=0
|
||||
IFS="|"
|
||||
infer_run_exports=(${infer_export_list})
|
||||
infer_quant_flag=(${infer_is_quant})
|
||||
for infer_model in ${infer_model_dir_list[*]}; do
|
||||
# run export
|
||||
if [ ${infer_run_exports[Count]} != "null" ];then
|
||||
save_infer_dir="${infer_model}"
|
||||
set_export_weight=$(func_set_params "${export_weight}" "${infer_model}")
|
||||
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_dir}")
|
||||
export_log_path="${LOG_PATH}_export_${Count}.log"
|
||||
export_cmd="${python} ${infer_run_exports[Count]} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 "
|
||||
echo ${infer_run_exports[Count]}
|
||||
echo $export_cmd
|
||||
eval $export_cmd
|
||||
status_export=$?
|
||||
status_check $status_export "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
|
||||
else
|
||||
save_infer_dir=${infer_model}
|
||||
fi
|
||||
#run inference
|
||||
is_quant=${infer_quant_flag[Count]}
|
||||
func_inference "${python}" "${inference_py}" "${save_infer_dir}" "${LOG_PATH}" "${infer_img_dir}" ${is_quant} "${gpu}"
|
||||
Count=$(($Count + 1))
|
||||
done
|
||||
else
|
||||
IFS="|"
|
||||
export Count=0
|
||||
USE_GPU_KEY=(${train_use_gpu_value})
|
||||
for gpu in ${gpu_list[*]}; do
|
||||
train_use_gpu=${USE_GPU_KEY[Count]}
|
||||
Count=$(($Count + 1))
|
||||
ips=""
|
||||
if [ ${gpu} = "-1" ];then
|
||||
env=""
|
||||
elif [ ${#gpu} -le 1 ];then
|
||||
env="export CUDA_VISIBLE_DEVICES=${gpu}"
|
||||
elif [ ${#gpu} -le 15 ];then
|
||||
IFS=","
|
||||
array=(${gpu})
|
||||
env="export CUDA_VISIBLE_DEVICES=${array[0]}"
|
||||
IFS="|"
|
||||
else
|
||||
IFS=";"
|
||||
array=(${gpu})
|
||||
ips=${array[0]}
|
||||
gpu=${array[1]}
|
||||
IFS="|"
|
||||
env=" "
|
||||
fi
|
||||
for autocast in ${autocast_list[*]}; do
|
||||
if [ ${autocast} = "amp" ]; then
|
||||
set_amp_config="amp.scale_loss=1024.0 amp.use_dynamic_loss_scaling=True amp.amp_level=O2"
|
||||
else
|
||||
set_amp_config="amp=None"
|
||||
fi
|
||||
for trainer in ${trainer_list[*]}; do
|
||||
flag_quant=False
|
||||
if [ ${trainer} = ${pact_key} ]; then
|
||||
run_train=${pact_trainer}
|
||||
run_export=${pact_export}
|
||||
flag_quant=True
|
||||
elif [ ${trainer} = "${fpgm_key}" ]; then
|
||||
run_train=${fpgm_trainer}
|
||||
run_export=${fpgm_export}
|
||||
elif [ ${trainer} = "${distill_key}" ]; then
|
||||
run_train=${distill_trainer}
|
||||
run_export=${distill_export}
|
||||
elif [ ${trainer} = "${to_static_key}" ]; then
|
||||
run_train="${norm_trainer} ${to_static_trainer}"
|
||||
run_export=${norm_export}
|
||||
elif [[ ${trainer} = ${trainer_key2} ]]; then
|
||||
run_train=${trainer_value2}
|
||||
run_export=${export_value2}
|
||||
else
|
||||
run_train=${norm_trainer}
|
||||
run_export=${norm_export}
|
||||
fi
|
||||
|
||||
if [ ${run_train} = "null" ]; then
|
||||
continue
|
||||
fi
|
||||
|
||||
set_epoch=$(func_set_params "${epoch_key}" "${epoch_num}")
|
||||
set_pretrain=$(func_set_params "${pretrain_model_key}" "${pretrain_model_value}")
|
||||
set_batchsize=$(func_set_params "${train_batch_key}" "${train_batch_value}")
|
||||
set_train_params1=$(func_set_params "${train_param_key1}" "${train_param_value1}")
|
||||
set_use_gpu=$(func_set_params "${train_use_gpu_key}" "${train_use_gpu}")
|
||||
# if length of ips >= 15, then it is seen as multi-machine
|
||||
# 15 is the min length of ips info for multi-machine: 0.0.0.0,0.0.0.0
|
||||
if [ ${#ips} -le 15 ];then
|
||||
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}"
|
||||
nodes=1
|
||||
else
|
||||
IFS=","
|
||||
ips_array=(${ips})
|
||||
IFS="|"
|
||||
nodes=${#ips_array[@]}
|
||||
save_log="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}"
|
||||
fi
|
||||
|
||||
|
||||
set_save_model=$(func_set_params "${save_model_key}" "${save_log}")
|
||||
if [ ${#gpu} -le 2 ];then # train with cpu or single gpu
|
||||
cmd="${python} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
|
||||
elif [ ${#ips} -le 15 ];then # train with multi-gpu
|
||||
cmd="${python} -m paddle.distributed.launch --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_epoch} ${set_pretrain} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
|
||||
else # train with multi-machine
|
||||
cmd="${python} -m paddle.distributed.launch --ips=${ips} --gpus=${gpu} ${run_train} ${set_use_gpu} ${set_save_model} ${set_pretrain} ${set_epoch} ${set_batchsize} ${set_amp_config} ${set_train_params1}"
|
||||
fi
|
||||
# run train
|
||||
eval $cmd
|
||||
eval "cat ${save_log}/train.log >> ${save_log}.log"
|
||||
status_check $? "${cmd}" "${status_log}" "${model_name}" "${save_log}.log"
|
||||
|
||||
set_eval_pretrain=$(func_set_params "${pretrain_model_key}" "${save_log}/${train_model_name}")
|
||||
|
||||
# run eval
|
||||
if [ ${eval_py} != "null" ]; then
|
||||
eval ${env}
|
||||
set_eval_params1=$(func_set_params "${eval_key1}" "${eval_value1}")
|
||||
eval_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_eval.log"
|
||||
eval_cmd="${python} ${eval_py} ${set_eval_pretrain} ${set_use_gpu} ${set_eval_params1} > ${eval_log_path} 2>&1 "
|
||||
eval $eval_cmd
|
||||
status_check $? "${eval_cmd}" "${status_log}" "${model_name}" "${eval_log_path}"
|
||||
fi
|
||||
# run export model
|
||||
if [ ${run_export} != "null" ]; then
|
||||
# run export model
|
||||
save_infer_path="${save_log}"
|
||||
export_log_path="${LOG_PATH}/${trainer}_gpus_${gpu}_autocast_${autocast}_nodes_${nodes}_export.log"
|
||||
set_export_weight=$(func_set_params "${export_weight}" "${save_log}/${train_model_name}")
|
||||
set_save_infer_key=$(func_set_params "${save_infer_key}" "${save_infer_path}")
|
||||
export_cmd="${python} ${run_export} ${set_export_weight} ${set_save_infer_key} > ${export_log_path} 2>&1 "
|
||||
eval $export_cmd
|
||||
status_check $? "${export_cmd}" "${status_log}" "${model_name}" "${export_log_path}"
|
||||
|
||||
#run inference
|
||||
eval $env
|
||||
save_infer_path="${save_log}"
|
||||
if [[ ${inference_dir} != "null" ]] && [[ ${inference_dir} != '##' ]]; then
|
||||
infer_model_dir="${save_infer_path}/${inference_dir}"
|
||||
else
|
||||
infer_model_dir=${save_infer_path}
|
||||
fi
|
||||
func_inference "${python}" "${inference_py}" "${infer_model_dir}" "${LOG_PATH}" "${train_infer_img_dir}" "${flag_quant}" "${gpu}"
|
||||
|
||||
eval "unset CUDA_VISIBLE_DEVICES"
|
||||
fi
|
||||
done # done with: for trainer in ${trainer_list[*]}; do
|
||||
done # done with: for autocast in ${autocast_list[*]}; do
|
||||
done # done with: for gpu in ${gpu_list[*]}; do
|
||||
fi # end if [ ${MODE} = "infer" ]; then
|
||||
@ -0,0 +1,3 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/8 13:14
|
||||
# @Author : zhoujun
|
||||
93
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/eval.py
Normal file
93
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/eval.py
Normal file
@ -0,0 +1,93 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2018/6/11 15:54
|
||||
# @Author : zhoujun
|
||||
import os
|
||||
import sys
|
||||
import pathlib
|
||||
|
||||
__dir__ = pathlib.Path(os.path.abspath(__file__))
|
||||
sys.path.append(str(__dir__))
|
||||
sys.path.append(str(__dir__.parent.parent))
|
||||
|
||||
import argparse
|
||||
import time
|
||||
import paddle
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
class EVAL:
|
||||
def __init__(self, model_path, gpu_id=0):
|
||||
from models import build_model
|
||||
from data_loader import get_dataloader
|
||||
from post_processing import get_post_processing
|
||||
from utils import get_metric
|
||||
|
||||
self.gpu_id = gpu_id
|
||||
if (
|
||||
self.gpu_id is not None
|
||||
and isinstance(self.gpu_id, int)
|
||||
and paddle.device.is_compiled_with_cuda()
|
||||
):
|
||||
paddle.device.set_device("gpu:{}".format(self.gpu_id))
|
||||
else:
|
||||
paddle.device.set_device("cpu")
|
||||
checkpoint = paddle.load(model_path)
|
||||
config = checkpoint["config"]
|
||||
config["arch"]["backbone"]["pretrained"] = False
|
||||
|
||||
self.validate_loader = get_dataloader(
|
||||
config["dataset"]["validate"], config["distributed"]
|
||||
)
|
||||
|
||||
self.model = build_model(config["arch"])
|
||||
self.model.set_state_dict(checkpoint["state_dict"])
|
||||
|
||||
self.post_process = get_post_processing(config["post_processing"])
|
||||
self.metric_cls = get_metric(config["metric"])
|
||||
|
||||
def eval(self):
|
||||
self.model.eval()
|
||||
raw_metrics = []
|
||||
total_frame = 0.0
|
||||
total_time = 0.0
|
||||
for i, batch in tqdm(
|
||||
enumerate(self.validate_loader),
|
||||
total=len(self.validate_loader),
|
||||
desc="test model",
|
||||
):
|
||||
with paddle.no_grad():
|
||||
start = time.time()
|
||||
preds = self.model(batch["img"])
|
||||
boxes, scores = self.post_process(
|
||||
batch, preds, is_output_polygon=self.metric_cls.is_output_polygon
|
||||
)
|
||||
total_frame += batch["img"].shape[0]
|
||||
total_time += time.time() - start
|
||||
raw_metric = self.metric_cls.validate_measure(batch, (boxes, scores))
|
||||
raw_metrics.append(raw_metric)
|
||||
metrics = self.metric_cls.gather_measure(raw_metrics)
|
||||
print("FPS:{}".format(total_frame / total_time))
|
||||
return {
|
||||
"recall": metrics["recall"].avg,
|
||||
"precision": metrics["precision"].avg,
|
||||
"fmeasure": metrics["fmeasure"].avg,
|
||||
}
|
||||
|
||||
|
||||
def init_args():
|
||||
parser = argparse.ArgumentParser(description="DBNet.paddle")
|
||||
parser.add_argument(
|
||||
"--model_path",
|
||||
required=False,
|
||||
default="output/DBNet_resnet18_FPN_DBHead/checkpoint/1.pth",
|
||||
type=str,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = init_args()
|
||||
eval = EVAL(args.model_path)
|
||||
result = eval.eval()
|
||||
print(result)
|
||||
@ -0,0 +1,57 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
__dir__ = os.path.dirname(os.path.abspath(__file__))
|
||||
sys.path.append(__dir__)
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(__dir__, "..")))
|
||||
|
||||
import argparse
|
||||
|
||||
import paddle
|
||||
from paddle.jit import to_static
|
||||
|
||||
from models import build_model
|
||||
from utils import Config, ArgsParser
|
||||
|
||||
|
||||
def init_args():
|
||||
parser = ArgsParser()
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def load_checkpoint(model, checkpoint_path):
|
||||
"""
|
||||
load checkpoints
|
||||
:param checkpoint_path: Checkpoint path to be loaded
|
||||
"""
|
||||
checkpoint = paddle.load(checkpoint_path)
|
||||
model.set_state_dict(checkpoint["state_dict"])
|
||||
print("load checkpoint from {}".format(checkpoint_path))
|
||||
|
||||
|
||||
def main(config):
|
||||
model = build_model(config["arch"])
|
||||
load_checkpoint(model, config["trainer"]["resume_checkpoint"])
|
||||
model.eval()
|
||||
|
||||
save_path = config["trainer"]["output_dir"]
|
||||
save_path = os.path.join(save_path, "inference")
|
||||
infer_shape = [3, -1, -1]
|
||||
model = to_static(
|
||||
model,
|
||||
input_spec=[
|
||||
paddle.static.InputSpec(shape=[None] + infer_shape, dtype="float32")
|
||||
],
|
||||
)
|
||||
|
||||
paddle.jit.save(model, save_path)
|
||||
print("inference model is saved to {}".format(save_path))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = init_args()
|
||||
assert os.path.exists(args.config_file)
|
||||
config = Config(args.config_file)
|
||||
config.merge_dict(args.opt)
|
||||
main(config.cfg)
|
||||
315
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/infer.py
Normal file
315
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/infer.py
Normal file
@ -0,0 +1,315 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pathlib
|
||||
|
||||
__dir__ = pathlib.Path(os.path.abspath(__file__))
|
||||
sys.path.append(str(__dir__))
|
||||
sys.path.append(str(__dir__.parent.parent))
|
||||
|
||||
import cv2
|
||||
import paddle
|
||||
from paddle import inference
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from paddle.vision import transforms
|
||||
from tools.predict import resize_image
|
||||
from post_processing import get_post_processing
|
||||
from utils.util import draw_bbox, save_result
|
||||
|
||||
|
||||
class InferenceEngine(object):
|
||||
"""InferenceEngine
|
||||
|
||||
Inference engine class which contains preprocess, run, postprocess
|
||||
"""
|
||||
|
||||
def __init__(self, args):
|
||||
"""
|
||||
Args:
|
||||
args: Parameters generated using argparser.
|
||||
Returns: None
|
||||
"""
|
||||
super().__init__()
|
||||
self.args = args
|
||||
|
||||
# init inference engine
|
||||
(
|
||||
self.predictor,
|
||||
self.config,
|
||||
self.input_tensor,
|
||||
self.output_tensor,
|
||||
) = self.load_predictor(
|
||||
os.path.join(args.model_dir, "inference.pdmodel"),
|
||||
os.path.join(args.model_dir, "inference.pdiparams"),
|
||||
)
|
||||
|
||||
# build transforms
|
||||
self.transforms = transforms.Compose(
|
||||
[
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize(
|
||||
mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# wamrup
|
||||
if self.args.warmup > 0:
|
||||
for idx in range(args.warmup):
|
||||
print(idx)
|
||||
x = np.random.rand(
|
||||
1, 3, self.args.crop_size, self.args.crop_size
|
||||
).astype("float32")
|
||||
self.input_tensor.copy_from_cpu(x)
|
||||
self.predictor.run()
|
||||
self.output_tensor.copy_to_cpu()
|
||||
|
||||
self.post_process = get_post_processing(
|
||||
{
|
||||
"type": "SegDetectorRepresenter",
|
||||
"args": {
|
||||
"thresh": 0.3,
|
||||
"box_thresh": 0.7,
|
||||
"max_candidates": 1000,
|
||||
"unclip_ratio": 1.5,
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
def load_predictor(self, model_file_path, params_file_path):
|
||||
"""load_predictor
|
||||
initialize the inference engine
|
||||
Args:
|
||||
model_file_path: inference model path (*.pdmodel)
|
||||
model_file_path: inference parameter path (*.pdiparams)
|
||||
Return:
|
||||
predictor: Predictor created using Paddle Inference.
|
||||
config: Configuration of the predictor.
|
||||
input_tensor: Input tensor of the predictor.
|
||||
output_tensor: Output tensor of the predictor.
|
||||
"""
|
||||
args = self.args
|
||||
config = inference.Config(model_file_path, params_file_path)
|
||||
if args.use_gpu:
|
||||
config.enable_use_gpu(1000, 0)
|
||||
if args.use_tensorrt:
|
||||
config.enable_tensorrt_engine(
|
||||
workspace_size=1 << 30,
|
||||
precision_mode=precision,
|
||||
max_batch_size=args.max_batch_size,
|
||||
min_subgraph_size=args.min_subgraph_size, # skip the minimum trt subgraph
|
||||
use_calib_mode=False,
|
||||
)
|
||||
|
||||
# collect shape
|
||||
trt_shape_f = os.path.join(model_dir, "_trt_dynamic_shape.txt")
|
||||
|
||||
if not os.path.exists(trt_shape_f):
|
||||
config.collect_shape_range_info(trt_shape_f)
|
||||
logger.info(f"collect dynamic shape info into : {trt_shape_f}")
|
||||
try:
|
||||
config.enable_tuned_tensorrt_dynamic_shape(trt_shape_f, True)
|
||||
except Exception as E:
|
||||
logger.info(E)
|
||||
logger.info("Please keep your paddlepaddle-gpu >= 2.3.0!")
|
||||
else:
|
||||
config.disable_gpu()
|
||||
# The thread num should not be greater than the number of cores in the CPU.
|
||||
if args.enable_mkldnn:
|
||||
# cache 10 different shapes for mkldnn to avoid memory leak
|
||||
config.set_mkldnn_cache_capacity(10)
|
||||
config.enable_mkldnn()
|
||||
if args.precision == "fp16":
|
||||
config.enable_mkldnn_bfloat16()
|
||||
if hasattr(args, "cpu_threads"):
|
||||
config.set_cpu_math_library_num_threads(args.cpu_threads)
|
||||
else:
|
||||
# default cpu threads as 10
|
||||
config.set_cpu_math_library_num_threads(10)
|
||||
|
||||
# enable memory optim
|
||||
config.enable_memory_optim()
|
||||
config.disable_glog_info()
|
||||
|
||||
config.switch_use_feed_fetch_ops(False)
|
||||
config.switch_ir_optim(True)
|
||||
|
||||
# create predictor
|
||||
predictor = inference.create_predictor(config)
|
||||
|
||||
# get input and output tensor property
|
||||
input_names = predictor.get_input_names()
|
||||
input_tensor = predictor.get_input_handle(input_names[0])
|
||||
|
||||
output_names = predictor.get_output_names()
|
||||
output_tensor = predictor.get_output_handle(output_names[0])
|
||||
|
||||
return predictor, config, input_tensor, output_tensor
|
||||
|
||||
def preprocess(self, img_path, short_size):
|
||||
"""preprocess
|
||||
Preprocess to the input.
|
||||
Args:
|
||||
img_path: Image path.
|
||||
Returns: Input data after preprocess.
|
||||
"""
|
||||
img = cv2.imread(img_path, 1)
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
||||
h, w = img.shape[:2]
|
||||
img = resize_image(img, short_size)
|
||||
img = self.transforms(img)
|
||||
img = np.expand_dims(img, axis=0)
|
||||
shape_info = {"shape": [(h, w)]}
|
||||
return img, shape_info
|
||||
|
||||
def postprocess(self, x, shape_info, is_output_polygon):
|
||||
"""postprocess
|
||||
Postprocess to the inference engine output.
|
||||
Args:
|
||||
x: Inference engine output.
|
||||
Returns: Output data after argmax.
|
||||
"""
|
||||
box_list, score_list = self.post_process(
|
||||
shape_info, x, is_output_polygon=is_output_polygon
|
||||
)
|
||||
box_list, score_list = box_list[0], score_list[0]
|
||||
if len(box_list) > 0:
|
||||
if is_output_polygon:
|
||||
idx = [x.sum() > 0 for x in box_list]
|
||||
box_list = [box_list[i] for i, v in enumerate(idx) if v]
|
||||
score_list = [score_list[i] for i, v in enumerate(idx) if v]
|
||||
else:
|
||||
idx = (
|
||||
box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0
|
||||
) # 去掉全为0的框
|
||||
box_list, score_list = box_list[idx], score_list[idx]
|
||||
else:
|
||||
box_list, score_list = [], []
|
||||
return box_list, score_list
|
||||
|
||||
def run(self, x):
|
||||
"""run
|
||||
Inference process using inference engine.
|
||||
Args:
|
||||
x: Input data after preprocess.
|
||||
Returns: Inference engine output
|
||||
"""
|
||||
self.input_tensor.copy_from_cpu(x)
|
||||
self.predictor.run()
|
||||
output = self.output_tensor.copy_to_cpu()
|
||||
return output
|
||||
|
||||
|
||||
def get_args(add_help=True):
|
||||
"""
|
||||
parse args
|
||||
"""
|
||||
import argparse
|
||||
|
||||
def str2bool(v):
|
||||
return v.lower() in ("true", "t", "1")
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="PaddlePaddle Classification Training", add_help=add_help
|
||||
)
|
||||
|
||||
parser.add_argument("--model_dir", default=None, help="inference model dir")
|
||||
parser.add_argument("--batch_size", type=int, default=1)
|
||||
parser.add_argument("--short_size", default=1024, type=int, help="short size")
|
||||
parser.add_argument("--img_path", default="./images/demo.jpg")
|
||||
|
||||
parser.add_argument("--benchmark", default=False, type=str2bool, help="benchmark")
|
||||
parser.add_argument("--warmup", default=0, type=int, help="warmup iter")
|
||||
parser.add_argument("--polygon", action="store_true", help="output polygon or box")
|
||||
|
||||
parser.add_argument("--use_gpu", type=str2bool, default=True)
|
||||
parser.add_argument("--use_tensorrt", type=str2bool, default=False)
|
||||
parser.add_argument("--precision", type=str, default="fp32")
|
||||
parser.add_argument("--gpu_mem", type=int, default=500)
|
||||
parser.add_argument("--gpu_id", type=int, default=0)
|
||||
parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
|
||||
parser.add_argument("--cpu_threads", type=int, default=10)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def main(args):
|
||||
"""
|
||||
Main inference function.
|
||||
Args:
|
||||
args: Parameters generated using argparser.
|
||||
Returns:
|
||||
class_id: Class index of the input.
|
||||
prob: : Probability of the input.
|
||||
"""
|
||||
inference_engine = InferenceEngine(args)
|
||||
|
||||
# init benchmark
|
||||
if args.benchmark:
|
||||
import auto_log
|
||||
|
||||
autolog = auto_log.AutoLogger(
|
||||
model_name="db",
|
||||
batch_size=args.batch_size,
|
||||
inference_config=inference_engine.config,
|
||||
gpu_ids="auto" if args.use_gpu else None,
|
||||
)
|
||||
|
||||
# enable benchmark
|
||||
if args.benchmark:
|
||||
autolog.times.start()
|
||||
|
||||
# preprocess
|
||||
img, shape_info = inference_engine.preprocess(args.img_path, args.short_size)
|
||||
|
||||
if args.benchmark:
|
||||
autolog.times.stamp()
|
||||
|
||||
output = inference_engine.run(img)
|
||||
|
||||
if args.benchmark:
|
||||
autolog.times.stamp()
|
||||
|
||||
# postprocess
|
||||
box_list, score_list = inference_engine.postprocess(
|
||||
output, shape_info, args.polygon
|
||||
)
|
||||
|
||||
if args.benchmark:
|
||||
autolog.times.stamp()
|
||||
autolog.times.end(stamp=True)
|
||||
autolog.report()
|
||||
|
||||
img = draw_bbox(cv2.imread(args.img_path)[:, :, ::-1], box_list)
|
||||
# 保存结果到路径
|
||||
os.makedirs("output", exist_ok=True)
|
||||
img_path = pathlib.Path(args.img_path)
|
||||
output_path = os.path.join("output", img_path.stem + "_infer_result.jpg")
|
||||
cv2.imwrite(output_path, img[:, :, ::-1])
|
||||
save_result(
|
||||
output_path.replace("_infer_result.jpg", ".txt"),
|
||||
box_list,
|
||||
score_list,
|
||||
args.polygon,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
main(args)
|
||||
175
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/predict.py
Normal file
175
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/predict.py
Normal file
@ -0,0 +1,175 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/24 12:06
|
||||
# @Author : zhoujun
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pathlib
|
||||
|
||||
__dir__ = pathlib.Path(os.path.abspath(__file__))
|
||||
sys.path.append(str(__dir__))
|
||||
sys.path.append(str(__dir__.parent.parent))
|
||||
|
||||
import time
|
||||
import cv2
|
||||
import paddle
|
||||
|
||||
from data_loader import get_transforms
|
||||
from models import build_model
|
||||
from post_processing import get_post_processing
|
||||
|
||||
|
||||
def resize_image(img, short_size):
|
||||
height, width, _ = img.shape
|
||||
if height < width:
|
||||
new_height = short_size
|
||||
new_width = new_height / height * width
|
||||
else:
|
||||
new_width = short_size
|
||||
new_height = new_width / width * height
|
||||
new_height = int(round(new_height / 32) * 32)
|
||||
new_width = int(round(new_width / 32) * 32)
|
||||
resized_img = cv2.resize(img, (new_width, new_height))
|
||||
return resized_img
|
||||
|
||||
|
||||
class PaddleModel:
|
||||
def __init__(self, model_path, post_p_thre=0.7, gpu_id=None):
|
||||
"""
|
||||
初始化模型
|
||||
:param model_path: 模型地址(可以是模型的参数或者参数和计算图一起保存的文件)
|
||||
:param gpu_id: 在哪一块gpu上运行
|
||||
"""
|
||||
self.gpu_id = gpu_id
|
||||
|
||||
if (
|
||||
self.gpu_id is not None
|
||||
and isinstance(self.gpu_id, int)
|
||||
and paddle.device.is_compiled_with_cuda()
|
||||
):
|
||||
paddle.device.set_device("gpu:{}".format(self.gpu_id))
|
||||
else:
|
||||
paddle.device.set_device("cpu")
|
||||
checkpoint = paddle.load(model_path)
|
||||
|
||||
config = checkpoint["config"]
|
||||
config["arch"]["backbone"]["pretrained"] = False
|
||||
self.model = build_model(config["arch"])
|
||||
self.post_process = get_post_processing(config["post_processing"])
|
||||
self.post_process.box_thresh = post_p_thre
|
||||
self.img_mode = config["dataset"]["train"]["dataset"]["args"]["img_mode"]
|
||||
self.model.set_state_dict(checkpoint["state_dict"])
|
||||
self.model.eval()
|
||||
|
||||
self.transform = []
|
||||
for t in config["dataset"]["train"]["dataset"]["args"]["transforms"]:
|
||||
if t["type"] in ["ToTensor", "Normalize"]:
|
||||
self.transform.append(t)
|
||||
self.transform = get_transforms(self.transform)
|
||||
|
||||
def predict(self, img_path: str, is_output_polygon=False, short_size: int = 1024):
|
||||
"""
|
||||
对传入的图像进行预测,支持图像地址,opencv 读取图片,偏慢
|
||||
:param img_path: 图像地址
|
||||
:param is_numpy:
|
||||
:return:
|
||||
"""
|
||||
assert os.path.exists(img_path), "file is not exists"
|
||||
img = cv2.imread(img_path, 1 if self.img_mode != "GRAY" else 0)
|
||||
if self.img_mode == "RGB":
|
||||
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
|
||||
h, w = img.shape[:2]
|
||||
img = resize_image(img, short_size)
|
||||
# 将图片由(w,h)变为(1,img_channel,h,w)
|
||||
tensor = self.transform(img)
|
||||
tensor = tensor.unsqueeze_(0)
|
||||
|
||||
batch = {"shape": [(h, w)]}
|
||||
with paddle.no_grad():
|
||||
start = time.time()
|
||||
preds = self.model(tensor)
|
||||
box_list, score_list = self.post_process(
|
||||
batch, preds, is_output_polygon=is_output_polygon
|
||||
)
|
||||
box_list, score_list = box_list[0], score_list[0]
|
||||
if len(box_list) > 0:
|
||||
if is_output_polygon:
|
||||
idx = [x.sum() > 0 for x in box_list]
|
||||
box_list = [box_list[i] for i, v in enumerate(idx) if v]
|
||||
score_list = [score_list[i] for i, v in enumerate(idx) if v]
|
||||
else:
|
||||
idx = (
|
||||
box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0
|
||||
) # 去掉全为0的框
|
||||
box_list, score_list = box_list[idx], score_list[idx]
|
||||
else:
|
||||
box_list, score_list = [], []
|
||||
t = time.time() - start
|
||||
return preds[0, 0, :, :].detach().cpu().numpy(), box_list, score_list, t
|
||||
|
||||
|
||||
def save_depoly(net, input, save_path):
|
||||
input_spec = [paddle.static.InputSpec(shape=[None, 3, None, None], dtype="float32")]
|
||||
net = paddle.jit.to_static(net, input_spec=input_spec)
|
||||
|
||||
# save static model for inference directly
|
||||
paddle.jit.save(net, save_path)
|
||||
|
||||
|
||||
def init_args():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="DBNet.paddle")
|
||||
parser.add_argument("--model_path", default=r"model_best.pth", type=str)
|
||||
parser.add_argument(
|
||||
"--input_folder", default="./test/input", type=str, help="img path for predict"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_folder", default="./test/output", type=str, help="img path for output"
|
||||
)
|
||||
parser.add_argument("--gpu", default=0, type=int, help="gpu for inference")
|
||||
parser.add_argument(
|
||||
"--thre", default=0.3, type=float, help="the thresh of post_processing"
|
||||
)
|
||||
parser.add_argument("--polygon", action="store_true", help="output polygon or box")
|
||||
parser.add_argument("--show", action="store_true", help="show result")
|
||||
parser.add_argument(
|
||||
"--save_result", action="store_true", help="save box and score to txt file"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: run DBNet detection on every image in --input_folder
    # and write visualisations plus box/score text files to --output_folder.
    import pathlib
    from tqdm import tqdm
    import matplotlib.pyplot as plt
    from utils.util import show_img, draw_bbox, save_result, get_image_file_list

    args = init_args()
    print(args)
    # Initialise the network from the checkpoint.
    model = PaddleModel(args.model_path, post_p_thre=args.thre, gpu_id=args.gpu)
    img_folder = pathlib.Path(args.input_folder)
    for img_path in tqdm(get_image_file_list(args.input_folder)):
        # predict() returns the raw probability map, detected boxes,
        # their scores and the elapsed time.
        preds, boxes_list, score_list, t = model.predict(
            img_path, is_output_polygon=args.polygon
        )
        # cv2 loads BGR; [:, :, ::-1] flips to RGB before drawing.
        img = draw_bbox(cv2.imread(img_path)[:, :, ::-1], boxes_list)
        if args.show:
            show_img(preds)
            show_img(img, title=os.path.basename(img_path))
            plt.show()
        # Save results to the output folder.
        os.makedirs(args.output_folder, exist_ok=True)
        img_path = pathlib.Path(img_path)
        output_path = os.path.join(args.output_folder, img_path.stem + "_result.jpg")
        pred_path = os.path.join(args.output_folder, img_path.stem + "_pred.jpg")
        # Flip back to BGR for cv2.imwrite.
        cv2.imwrite(output_path, img[:, :, ::-1])
        # Probability map is in [0, 1]; scale to 8-bit range for saving.
        cv2.imwrite(pred_path, preds * 255)
        save_result(
            output_path.replace("_result.jpg", ".txt"),
            boxes_list,
            score_list,
            args.polygon,
        )
|
||||
64
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/train.py
Normal file
64
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/tools/train.py
Normal file
@ -0,0 +1,64 @@
|
||||
import os
|
||||
import sys
|
||||
import pathlib
|
||||
|
||||
__dir__ = pathlib.Path(os.path.abspath(__file__))
|
||||
sys.path.append(str(__dir__))
|
||||
sys.path.append(str(__dir__.parent.parent))
|
||||
|
||||
import paddle
|
||||
import paddle.distributed as dist
|
||||
from utils import Config, ArgsParser
|
||||
|
||||
|
||||
def init_args():
    """Parse the training command line via the project's ArgsParser.

    Returns:
        the parsed arguments namespace (includes config_file, opt and
        profiler_options as used by the ``__main__`` section).
    """
    return ArgsParser().parse_args()
|
||||
|
||||
|
||||
def main(config, profiler_options):
    """Wire up data loaders, model, loss, post-processing and metric from the
    config dict, then run training via ``Trainer``.

    Args:
        config: parsed configuration mapping (mutated in place: adds
            ``distributed`` and fixes the backbone input channel count).
        profiler_options: Paddle profiler option string, forwarded to Trainer.
    """
    # Imports are local so the module can be imported without pulling in
    # the whole training stack.
    from models import build_model, build_loss
    from data_loader import get_dataloader
    from trainer import Trainer
    from post_processing import get_post_processing
    from utils import get_metric

    # Enable data-parallel training only when more than one GPU is visible.
    if paddle.device.cuda.device_count() > 1:
        dist.init_parallel_env()
        config["distributed"] = True
    else:
        config["distributed"] = False
    train_loader = get_dataloader(config["dataset"]["train"], config["distributed"])
    assert train_loader is not None
    # Validation loader is optional; never distributed.
    if "validate" in config["dataset"]:
        validate_loader = get_dataloader(config["dataset"]["validate"], False)
    else:
        validate_loader = None
    criterion = build_loss(config["loss"])
    # Grayscale input uses 1 channel, everything else 3.
    config["arch"]["backbone"]["in_channels"] = (
        3 if config["dataset"]["train"]["dataset"]["args"]["img_mode"] != "GRAY" else 1
    )
    model = build_model(config["arch"])
    # set @to_static for benchmark, skip this by default.
    post_p = get_post_processing(config["post_processing"])
    metric = get_metric(config["metric"])
    trainer = Trainer(
        config=config,
        model=model,
        criterion=criterion,
        train_loader=train_loader,
        post_process=post_p,
        metric_cls=metric,
        validate_loader=validate_loader,
        profiler_options=profiler_options,
    )
    trainer.train()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Parse CLI args, load the YAML/JSON config, apply -o overrides,
    # then launch training.
    args = init_args()
    assert os.path.exists(args.config_file)
    config = Config(args.config_file)
    # Command-line -o key=value pairs override config file entries.
    config.merge_dict(args.opt)
    main(config.cfg, args.profiler_options)
|
||||
@ -0,0 +1,4 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:58
|
||||
# @Author : zhoujun
|
||||
from .trainer import Trainer
|
||||
256
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/trainer/trainer.py
Normal file
256
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/trainer/trainer.py
Normal file
@ -0,0 +1,256 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:58
|
||||
# @Author : zhoujun
|
||||
import time
|
||||
|
||||
import paddle
|
||||
from tqdm import tqdm
|
||||
|
||||
from base import BaseTrainer
|
||||
from utils import runningScore, cal_text_score, Polynomial, profiler
|
||||
|
||||
|
||||
class Trainer(BaseTrainer):
    """DBNet trainer: implements the per-epoch training loop, periodic
    evaluation, console/VisualDL logging and best-model checkpointing on
    top of the generic BaseTrainer plumbing (optimizer, scheduler, amp,
    checkpoint dirs are assumed to be set up by the base class)."""

    def __init__(
        self,
        config,
        model,
        criterion,
        train_loader,
        validate_loader,
        metric_cls,
        post_process=None,
        profiler_options=None,
    ):
        """Store trainer options and delegate the heavy setup to BaseTrainer.

        Args:
            config: full configuration mapping.
            model: network to train.
            criterion: loss callable taking (preds, batch).
            train_loader: training data loader.
            validate_loader: validation data loader, or None.
            metric_cls: metric object with validate_measure/gather_measure.
            post_process: callable turning predictions into boxes/scores.
            profiler_options: Paddle profiler option string, or None.
        """
        super(Trainer, self).__init__(
            config,
            model,
            criterion,
            train_loader,
            validate_loader,
            metric_cls,
            post_process,
        )
        self.profiler_options = profiler_options
        # Periodic evaluation can be switched off from the config.
        self.enable_eval = config["trainer"].get("enable_eval", True)

    def _train_epoch(self, epoch):
        """Run one full training epoch and return summary statistics.

        Returns:
            dict with averaged train_loss, last lr, wall time and epoch index.
        """
        self.model.train()
        total_samples = 0
        train_reader_cost = 0.0
        train_batch_cost = 0.0
        reader_start = time.time()
        epoch_start = time.time()
        train_loss = 0.0
        # 2-class (text / background) pixel confusion-matrix tracker.
        running_metric_text = runningScore(2)

        for i, batch in enumerate(self.train_loader):
            profiler.add_profiler_step(self.profiler_options)
            if i >= self.train_loader_len:
                break
            self.global_step += 1
            lr = self.optimizer.get_lr()

            cur_batch_size = batch["img"].shape[0]

            # Time spent waiting on the data pipeline for this step.
            train_reader_cost += time.time() - reader_start
            if self.amp:
                # Mixed-precision path: forward under auto_cast, then
                # scaled backward + scaler-driven optimizer update.
                with paddle.amp.auto_cast(
                    enable="gpu" in paddle.device.get_device(),
                    custom_white_list=self.amp.get("custom_white_list", []),
                    custom_black_list=self.amp.get("custom_black_list", []),
                    level=self.amp.get("level", "O2"),
                ):
                    preds = self.model(batch["img"])
                    loss_dict = self.criterion(preds.astype(paddle.float32), batch)
                scaled_loss = self.amp["scaler"].scale(loss_dict["loss"])
                scaled_loss.backward()
                self.amp["scaler"].minimize(self.optimizer, scaled_loss)
            else:
                preds = self.model(batch["img"])
                loss_dict = self.criterion(preds, batch)
                # backward
                loss_dict["loss"].backward()
                self.optimizer.step()
            self.lr_scheduler.step()
            self.optimizer.clear_grad()

            train_batch_time = time.time() - reader_start
            train_batch_cost += train_batch_time
            total_samples += cur_batch_size

            # Pixel accuracy / IoU on the shrink map (channel 0 of preds).
            score_shrink_map = cal_text_score(
                preds[:, 0, :, :],
                batch["shrink_map"],
                batch["shrink_mask"],
                running_metric_text,
                thred=self.config["post_processing"]["args"]["thresh"],
            )

            # Record loss and acc to the log; also converts every loss
            # tensor in loss_dict to a plain Python float via .item().
            loss_str = "loss: {:.4f}, ".format(loss_dict["loss"].item())
            for idx, (key, value) in enumerate(loss_dict.items()):
                loss_dict[key] = value.item()
                if key == "loss":
                    continue
                loss_str += "{}: {:.4f}".format(key, loss_dict[key])
                if idx < len(loss_dict) - 1:
                    loss_str += ", "

            train_loss += loss_dict["loss"]
            acc = score_shrink_map["Mean Acc"]
            iou_shrink_map = score_shrink_map["Mean IoU"]

            if self.global_step % self.log_iter == 0:
                self.logger_info(
                    "[{}/{}], [{}/{}], global_step: {}, ips: {:.1f} samples/sec, avg_reader_cost: {:.5f} s, avg_batch_cost: {:.5f} s, avg_samples: {}, acc: {:.4f}, iou_shrink_map: {:.4f}, {}lr:{:.6}, time:{:.2f}".format(
                        epoch,
                        self.epochs,
                        i + 1,
                        self.train_loader_len,
                        self.global_step,
                        total_samples / train_batch_cost,
                        train_reader_cost / self.log_iter,
                        train_batch_cost / self.log_iter,
                        total_samples / self.log_iter,
                        acc,
                        iou_shrink_map,
                        loss_str,
                        lr,
                        train_batch_cost,
                    )
                )
                # Reset the windowed counters after each log line.
                total_samples = 0
                train_reader_cost = 0.0
                train_batch_cost = 0.0

            # Only rank 0 writes VisualDL scalars.
            if self.visualdl_enable and paddle.distributed.get_rank() == 0:
                # write tensorboard
                for key, value in loss_dict.items():
                    self.writer.add_scalar(
                        "TRAIN/LOSS/{}".format(key), value, self.global_step
                    )
                self.writer.add_scalar("TRAIN/ACC_IOU/acc", acc, self.global_step)
                self.writer.add_scalar(
                    "TRAIN/ACC_IOU/iou_shrink_map", iou_shrink_map, self.global_step
                )
                self.writer.add_scalar("TRAIN/lr", lr, self.global_step)
            reader_start = time.time()
        # NOTE(review): `lr`/`train_loss` assume at least one batch per
        # epoch; an empty loader would raise NameError here.
        return {
            "train_loss": train_loss / self.train_loader_len,
            "lr": lr,
            "time": time.time() - epoch_start,
            "epoch": epoch,
        }

    def _eval(self, epoch):
        """Evaluate on the validation loader and return (recall, precision, fmeasure)."""
        self.model.eval()
        raw_metrics = []
        total_frame = 0.0
        total_time = 0.0
        for i, batch in tqdm(
            enumerate(self.validate_loader),
            total=len(self.validate_loader),
            desc="test model",
        ):
            with paddle.no_grad():
                start = time.time()
                if self.amp:
                    with paddle.amp.auto_cast(
                        enable="gpu" in paddle.device.get_device(),
                        custom_white_list=self.amp.get("custom_white_list", []),
                        custom_black_list=self.amp.get("custom_black_list", []),
                        level=self.amp.get("level", "O2"),
                    ):
                        preds = self.model(batch["img"])
                    # Post-processing expects float32.
                    preds = preds.astype(paddle.float32)
                else:
                    preds = self.model(batch["img"])
                boxes, scores = self.post_process(
                    batch, preds, is_output_polygon=self.metric_cls.is_output_polygon
                )
                total_frame += batch["img"].shape[0]
                total_time += time.time() - start
                raw_metric = self.metric_cls.validate_measure(batch, (boxes, scores))
                raw_metrics.append(raw_metric)
        metrics = self.metric_cls.gather_measure(raw_metrics)
        self.logger_info("FPS:{}".format(total_frame / total_time))
        return metrics["recall"].avg, metrics["precision"].avg, metrics["fmeasure"].avg

    def _on_epoch_finish(self):
        """Log epoch stats, save the latest checkpoint, and (on rank 0)
        evaluate and keep a copy of the best model so far."""
        self.logger_info(
            "[{}/{}], train_loss: {:.4f}, time: {:.4f}, lr: {}".format(
                self.epoch_result["epoch"],
                self.epochs,
                self.epoch_result["train_loss"],
                self.epoch_result["time"],
                self.epoch_result["lr"],
            )
        )
        net_save_path = "{}/model_latest.pth".format(self.checkpoint_dir)
        net_save_path_best = "{}/model_best.pth".format(self.checkpoint_dir)

        # Only rank 0 checkpoints/evaluates to avoid duplicate writes.
        if paddle.distributed.get_rank() == 0:
            self._save_checkpoint(self.epoch_result["epoch"], net_save_path)
            save_best = False
            if (
                self.validate_loader is not None
                and self.metric_cls is not None
                and self.enable_eval
            ):  # use F1 (hmean) as the model-selection metric
                recall, precision, hmean = self._eval(self.epoch_result["epoch"])

                if self.visualdl_enable:
                    self.writer.add_scalar("EVAL/recall", recall, self.global_step)
                    self.writer.add_scalar(
                        "EVAL/precision", precision, self.global_step
                    )
                    self.writer.add_scalar("EVAL/hmean", hmean, self.global_step)
                self.logger_info(
                    "test: recall: {:.6f}, precision: {:.6f}, hmean: {:.6f}".format(
                        recall, precision, hmean
                    )
                )

                if hmean >= self.metrics["hmean"]:
                    save_best = True
                    self.metrics["train_loss"] = self.epoch_result["train_loss"]
                    self.metrics["hmean"] = hmean
                    self.metrics["precision"] = precision
                    self.metrics["recall"] = recall
                    self.metrics["best_model_epoch"] = self.epoch_result["epoch"]
            else:
                # No evaluation available: fall back to lowest train loss.
                if self.epoch_result["train_loss"] <= self.metrics["train_loss"]:
                    save_best = True
                    self.metrics["train_loss"] = self.epoch_result["train_loss"]
                    self.metrics["best_model_epoch"] = self.epoch_result["epoch"]
            best_str = "current best, "
            for k, v in self.metrics.items():
                best_str += "{}: {:.6f}, ".format(k, v)
            self.logger_info(best_str)
            if save_best:
                import shutil

                # "Best" model is a copy of the latest checkpoint.
                shutil.copy(net_save_path, net_save_path_best)
                self.logger_info("Saving current best: {}".format(net_save_path_best))
            else:
                self.logger_info("Saving checkpoint: {}".format(net_save_path))

    def _on_train_finish(self):
        """Log final metrics (if evaluation was enabled) and a finish banner."""
        if self.enable_eval:
            for k, v in self.metrics.items():
                self.logger_info("{}:{}".format(k, v))
        self.logger_info("finish train")

    def _initialize_scheduler(self):
        """Create the LR scheduler; the Polynomial schedule needs the epoch
        count and steps-per-epoch injected from the trainer config."""
        if self.config["lr_scheduler"]["type"] == "Polynomial":
            self.config["lr_scheduler"]["args"]["epochs"] = self.config["trainer"][
                "epochs"
            ]
            self.config["lr_scheduler"]["args"]["step_each_epoch"] = len(
                self.train_loader
            )
            self.lr_scheduler = Polynomial(**self.config["lr_scheduler"]["args"])()
        else:
            self.lr_scheduler = self._initialize("lr_scheduler", paddle.optimizer.lr)
|
||||
@ -0,0 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:58
|
||||
# @Author : zhoujun
|
||||
from .util import *
|
||||
from .metrics import *
|
||||
from .schedulers import *
|
||||
from .cal_recall.script import cal_recall_precision_f1
|
||||
from .ocr_metric import get_metric
|
||||
@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 1/16/19 6:40 AM
|
||||
# @Author : zhoujun
|
||||
from .script import cal_recall_precision_f1
|
||||
|
||||
__all__ = ["cal_recall_precision_f1"]
|
||||
@ -0,0 +1,494 @@
|
||||
#!/usr/bin/env python2
|
||||
# encoding: UTF-8
|
||||
import json
|
||||
import sys
|
||||
|
||||
sys.path.append("./")
|
||||
import zipfile
|
||||
import re
|
||||
import sys
|
||||
import os
|
||||
import codecs
|
||||
import traceback
|
||||
import numpy as np
|
||||
from utils import order_points_clockwise
|
||||
|
||||
|
||||
def print_help():
    """Write the command-line usage string to stdout and exit with status 2."""
    usage = (
        "Usage: python %s.py -g=<gtFile> -s=<submFile> "
        "[-o=<outputFolder> -p=<jsonParams>]" % sys.argv[0]
    )
    sys.stdout.write(usage)
    sys.exit(2)
|
||||
|
||||
|
||||
def load_zip_file_keys(file, fileNameRegExp=""):
    """Return the keys of the ZIP entries that match ``fileNameRegExp``.

    The key is the entry name itself, or the first capturing group of the
    pattern when the pattern defines one. An empty pattern keeps every entry.
    """
    try:
        archive = zipfile.ZipFile(file, mode="r", allowZip64=True)
    except:
        raise Exception("Error loading the ZIP archive.")

    keys = []
    for entry in archive.namelist():
        if fileNameRegExp == "":
            keys.append(entry)
            continue
        match = re.match(fileNameRegExp, entry)
        if match is None:
            # Non-matching entries are silently skipped.
            continue
        keys.append(match.group(1) if match.groups() else entry)

    return keys
|
||||
|
||||
|
||||
def load_zip_file(file, fileNameRegExp="", allEntries=False):
    """Return ``{key: raw_bytes}`` for the ZIP entries matching ``fileNameRegExp``.

    The key is the entry name, or the first capturing group of the pattern
    when one is defined. With ``allEntries=True`` a non-matching entry is an
    error instead of being skipped.
    """
    try:
        archive = zipfile.ZipFile(file, mode="r", allowZip64=True)
    except:
        raise Exception("Error loading the ZIP archive")

    contents = {}
    for entry in archive.namelist():
        key = entry
        matched = True
        if fileNameRegExp != "":
            match = re.match(fileNameRegExp, entry)
            if match is None:
                matched = False
            elif match.groups():
                key = match.group(1)

        if matched:
            contents[key] = archive.read(entry)
        elif allEntries:
            raise Exception("ZIP entry not valid: %s" % entry)

    return contents
|
||||
|
||||
|
||||
def load_folder_file(file, fileNameRegExp="", allEntries=False):
    """Return ``{key: text}`` for the files in folder ``file`` matching ``fileNameRegExp``.

    Folder-based twin of ``load_zip_file``: the key is the file name, or the
    first capturing group of the pattern when one is defined. With
    ``allEntries=True`` a non-matching file raises instead of being skipped.

    Fix: files are now opened with a context manager so handles are closed
    deterministically (the original leaked them via bare ``open(...).read()``).
    """
    pairs = []
    for name in os.listdir(file):
        addFile = True
        keyName = name
        if fileNameRegExp != "":
            m = re.match(fileNameRegExp, name)
            if m is None:
                addFile = False
            elif len(m.groups()) > 0:
                keyName = m.group(1)

        if addFile:
            # Close the handle promptly instead of relying on GC.
            with open(os.path.join(file, name)) as fh:
                pairs.append([keyName, fh.read()])
        elif allEntries:
            # Error message kept for compatibility with the ZIP variant.
            raise Exception("ZIP entry not valid: %s" % name)

    return dict(pairs)
|
||||
|
||||
|
||||
def decode_utf8(raw):
    """
    Returns a Unicode object on success, or None on failure.

    Decodes ``raw`` as UTF-8 (invalid sequences replaced), stripping a
    leading UTF-8 BOM if present.

    Fix: the BOM strip used ``raw.replace(codecs.BOM_UTF8, "", 1)`` —
    a bytes pattern with a str replacement — which raises TypeError on
    Python 3 and made every BOM-prefixed input return None. The
    replacement must be ``b""``.
    """
    try:
        raw = codecs.decode(raw, "utf-8", "replace")
        # extracts BOM if exists
        raw = raw.encode("utf8")
        if raw.startswith(codecs.BOM_UTF8):
            raw = raw.replace(codecs.BOM_UTF8, b"", 1)
        return raw.decode("utf-8")
    except:
        return None
|
||||
|
||||
|
||||
def validate_lines_in_file(
    fileName,
    file_contents,
    CRLF=True,
    LTRB=True,
    withTranscription=False,
    withConfidence=False,
    imWidth=0,
    imHeight=0,
):
    """
    Validate every non-empty line of ``file_contents`` by delegating to
    ``validate_tl_line``; raises on the first invalid line.

    Args:
        fileName: name used in error messages only.
        file_contents: raw bytes of the file (decoded here as UTF-8).
        CRLF: split lines on Windows CRLF instead of LF.
        LTRB/withTranscription/withConfidence/imWidth/imHeight: forwarded
            to the per-line validator.
    """
    utf8File = decode_utf8(file_contents)
    if utf8File is None:
        raise Exception("The file %s is not UTF-8" % fileName)

    lines = utf8File.split("\r\n" if CRLF else "\n")
    for line in lines:
        line = line.replace("\r", "").replace("\n", "")
        if line != "":
            try:
                validate_tl_line(
                    line, LTRB, withTranscription, withConfidence, imWidth, imHeight
                )
            except Exception as e:
                # NOTE(review): the message is encoded to bytes — a
                # Python 2 leftover; the exception still carries the info.
                raise Exception(
                    (
                        "Line in sample not valid. Sample: %s Line: %s Error: %s"
                        % (fileName, line, str(e))
                    ).encode("utf-8", "replace")
                )
|
||||
|
||||
|
||||
def validate_tl_line(
    line, LTRB=True, withTranscription=True, withConfidence=True, imWidth=0, imHeight=0
):
    """
    Validate the format of the line. If the line is not valid an exception will be raised.
    If maxWidth and maxHeight are specified, all points must be inside the image bounds.
    Possible values are:
    LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription]
    LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription]

    This is a thin wrapper: parsing and all checks happen in
    ``get_tl_line_values``; the parsed values are discarded.
    """
    get_tl_line_values(line, LTRB, withTranscription, withConfidence, imWidth, imHeight)
|
||||
|
||||
|
||||
def get_tl_line_values(
    line,
    LTRB=True,
    withTranscription=False,
    withConfidence=False,
    imWidth=0,
    imHeight=0,
):
    """
    Validate the format of the line. If the line is not valid an exception will be raised.
    If maxWidth and maxHeight are specified, all points must be inside the image bounds.
    Possible values are:
    LTRB=True: xmin,ymin,xmax,ymax[,confidence][,transcription]
    LTRB=False: x1,y1,x2,y2,x3,y3,x4,y4[,confidence][,transcription]
    Returns values from a textline. Points , [Confidences], [Transcriptions]

    Returns:
        (points, confidence, transcription) where ``points`` is a flat list
        of floats (4 values for LTRB, 8 otherwise), ``confidence`` defaults
        to 0.0 and ``transcription`` to "".
    """
    confidence = 0.0
    transcription = ""
    points = []

    numPoints = 4

    if LTRB:
        # Axis-aligned rectangle: xmin,ymin,xmax,ymax [+ extras].
        numPoints = 4

        if withTranscription and withConfidence:
            # NOTE(review): the confidence group "[0-1].?[0-9]*" is
            # permissive (e.g. matches "1x5"); kept as-is.
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$",
                line,
            )
            if m == None:
                # NOTE(review): this re-match is redundant (same pattern,
                # result unused) — the branch raises unconditionally.
                m = re.match(
                    r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$",
                    line,
                )
                raise Exception(
                    "Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence,transcription"
                )
        elif withConfidence:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$",
                line,
            )
            if m == None:
                raise Exception(
                    "Format incorrect. Should be: xmin,ymin,xmax,ymax,confidence"
                )
        elif withTranscription:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,(.*)$",
                line,
            )
            if m == None:
                raise Exception(
                    "Format incorrect. Should be: xmin,ymin,xmax,ymax,transcription"
                )
        else:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-9]+)\s*,\s*([0-9]+)\s*,?\s*$",
                line,
            )
            if m == None:
                raise Exception("Format incorrect. Should be: xmin,ymin,xmax,ymax")

        xmin = int(m.group(1))
        ymin = int(m.group(2))
        xmax = int(m.group(3))
        ymax = int(m.group(4))
        # The rectangle must be well-formed (max >= min on both axes).
        if xmax < xmin:
            raise Exception("Xmax value (%s) not valid (Xmax < Xmin)." % (xmax))
        if ymax < ymin:
            raise Exception("Ymax value (%s) not valid (Ymax < Ymin)." % (ymax))

        points = [float(m.group(i)) for i in range(1, (numPoints + 1))]

        # Bounds check only when image dimensions are provided.
        if imWidth > 0 and imHeight > 0:
            validate_point_inside_bounds(xmin, ymin, imWidth, imHeight)
            validate_point_inside_bounds(xmax, ymax, imWidth, imHeight)

    else:
        # General quadrilateral: 8 coordinates [+ extras].
        numPoints = 8

        if withTranscription and withConfidence:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*,(.*)$",
                line,
            )
            if m == None:
                raise Exception(
                    "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence,transcription"
                )
        elif withConfidence:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*([0-1].?[0-9]*)\s*$",
                line,
            )
            if m == None:
                raise Exception(
                    "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,confidence"
                )
        elif withTranscription:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,(.*)$",
                line,
            )
            if m == None:
                raise Exception(
                    "Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4,transcription"
                )
        else:
            m = re.match(
                r"^\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*,\s*(-?[0-9]+)\s*$",
                line,
            )
            if m == None:
                raise Exception("Format incorrect. Should be: x1,y1,x2,y2,x3,y3,x4,y4")

        points = [float(m.group(i)) for i in range(1, (numPoints + 1))]

        # Normalise vertex order, then enforce clockwise orientation.
        points = order_points_clockwise(np.array(points).reshape(-1, 2)).reshape(-1)
        validate_clockwise_points(points)

        if imWidth > 0 and imHeight > 0:
            validate_point_inside_bounds(points[0], points[1], imWidth, imHeight)
            validate_point_inside_bounds(points[2], points[3], imWidth, imHeight)
            validate_point_inside_bounds(points[4], points[5], imWidth, imHeight)
            validate_point_inside_bounds(points[6], points[7], imWidth, imHeight)

    if withConfidence:
        try:
            confidence = float(m.group(numPoints + 1))
        except ValueError:
            raise Exception("Confidence value must be a float")

    if withTranscription:
        # The transcription group comes right after the coordinates and
        # the optional confidence group.
        posTranscription = numPoints + (2 if withConfidence else 1)
        transcription = m.group(posTranscription)
        m2 = re.match(r"^\s*\"(.*)\"\s*$", transcription)
        if (
            m2 != None
        ):  # Transcription with double quotes, we extract the value and replace escaped characters
            transcription = m2.group(1).replace("\\\\", "\\").replace('\\"', '"')

    return points, confidence, transcription
|
||||
|
||||
|
||||
def validate_point_inside_bounds(x, y, imWidth, imHeight):
    """Raise if point (x, y) lies outside the image rectangle [0, imWidth] x [0, imHeight].

    Fixes two defects in the original:
    - the error messages referenced undefined names ``xmin``/``ymin``
      (NameError at runtime) instead of the parameters ``x``/``y``;
    - the Y-branch format string had four ``%s`` placeholders for three
      arguments (TypeError). The stray "Sample/Line" placeholders, which
      had no corresponding data, were removed.
    """
    if x < 0 or x > imWidth:
        raise Exception(
            "X value (%s) not valid. Image dimensions: (%s,%s)"
            % (x, imWidth, imHeight)
        )
    if y < 0 or y > imHeight:
        raise Exception(
            "Y value (%s) not valid. Image dimensions: (%s,%s)"
            % (y, imWidth, imHeight)
        )
|
||||
|
||||
|
||||
def validate_clockwise_points(points):
    """
    Validate that the 4 points delimiting a quadrilateral are given in
    clockwise order, using image coordinates (origin at the upper left,
    Y axis pointing down). Raises on bad length or wrong orientation.
    """
    if len(points) != 8:
        raise Exception("Points list not valid." + str(len(points)))

    # Group the flat list into 4 integer (x, y) corners.
    corners = [[int(points[2 * i]), int(points[2 * i + 1])] for i in range(4)]

    # Shoelace-style signed edge sum: a positive total means the polygon
    # is counter-clockwise in image coordinates, which is invalid here.
    summatory = 0
    for i in range(4):
        x0, y0 = corners[i]
        x1, y1 = corners[(i + 1) % 4]
        summatory += (x1 - x0) * (y1 + y0)

    if summatory > 0:
        raise Exception(
            "Points are not clockwise. The coordinates of bounding quadrilaterals have to be given in clockwise order. Regarding the correct interpretation of 'clockwise' remember that the image coordinate system used is the standard one, with the image origin at the upper left, the X axis extending to the right and Y axis extending downwards."
        )
|
||||
|
||||
|
||||
def get_tl_line_values_from_file_contents(
    content,
    CRLF=True,
    LTRB=True,
    withTranscription=False,
    withConfidence=False,
    imWidth=0,
    imHeight=0,
    sort_by_confidences=True,
):
    """
    Returns all points, confindences and transcriptions of a file in lists. Valid line formats:
    xmin,ymin,xmax,ymax,[confidence],[transcription]
    x1,y1,x2,y2,x3,y3,x4,y4,[confidence],[transcription]

    Empty lines are skipped; when confidences are present the three lists
    are returned sorted by descending confidence.
    """
    pointsList = []
    transcriptionsList = []
    confidencesList = []

    for raw_line in content.split("\r\n" if CRLF else "\n"):
        stripped = raw_line.replace("\r", "").replace("\n", "")
        if stripped == "":
            continue
        points, confidence, transcription = get_tl_line_values(
            stripped, LTRB, withTranscription, withConfidence, imWidth, imHeight
        )
        pointsList.append(points)
        transcriptionsList.append(transcription)
        confidencesList.append(confidence)

    if withConfidence and len(confidencesList) > 0 and sort_by_confidences:
        import numpy as np

        # Descending-confidence permutation applied to all three lists.
        order = np.argsort(-np.array(confidencesList))
        confidencesList = [confidencesList[i] for i in order]
        pointsList = [pointsList[i] for i in order]
        transcriptionsList = [transcriptionsList[i] for i in order]

    return pointsList, confidencesList, transcriptionsList
|
||||
|
||||
|
||||
def main_evaluation(
    p,
    default_evaluation_params_fn,
    validate_data_fn,
    evaluate_method_fn,
    show_result=True,
    per_sample=True,
):
    """
    This process validates a method, evaluates it and if it succeed generates a ZIP file with a JSON entry for each sample.
    Params:
    p: Dictionary of parameters with the GT/submission locations. If None is passed, the parameters send by the system are used.
    default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation
    validate_data_fn: points to a method that validates the correct format of the submission
    evaluate_method_fn: points to a function that evaluated the submission and return a Dictionary with the results

    Fix: the per-sample/output-item loops used ``dict.iteritems()``, which
    does not exist on Python 3; replaced with ``dict.items()``.
    """
    evalParams = default_evaluation_params_fn()
    if "p" in p.keys():
        # Caller overrides: "p" may be a dict or a JSON string wrapped in
        # one extra character on each side (legacy CLI format).
        evalParams.update(
            p["p"] if isinstance(p["p"], dict) else json.loads(p["p"][1:-1])
        )

    resDict = {"calculated": True, "Message": "", "method": "{}", "per_sample": "{}"}
    try:
        # validate_data_fn(p['g'], p['s'], evalParams)
        evalData = evaluate_method_fn(p["g"], p["s"], evalParams)
        resDict.update(evalData)

    except Exception as e:
        # Evaluation failure is reported in the result dict, not re-raised.
        traceback.print_exc()
        resDict["Message"] = str(e)
        resDict["calculated"] = False

    if "o" in p:
        if not os.path.exists(p["o"]):
            os.makedirs(p["o"])

        resultsOutputname = p["o"] + "/results.zip"
        outZip = zipfile.ZipFile(resultsOutputname, mode="w", allowZip64=True)

        # method.json holds only the summary, not the bulky per-sample data.
        del resDict["per_sample"]
        if "output_items" in resDict.keys():
            del resDict["output_items"]

        outZip.writestr("method.json", json.dumps(resDict))

    if not resDict["calculated"]:
        if show_result:
            sys.stderr.write("Error!\n" + resDict["Message"] + "\n\n")
        if "o" in p:
            outZip.close()
        return resDict

    if "o" in p:
        if per_sample == True:
            # Python 3: items() instead of the removed iteritems().
            for k, v in evalData["per_sample"].items():
                outZip.writestr(k + ".json", json.dumps(v))

            if "output_items" in evalData.keys():
                for k, v in evalData["output_items"].items():
                    outZip.writestr(k, v)

        outZip.close()

    if show_result:
        sys.stdout.write("Calculated!")
        sys.stdout.write(json.dumps(resDict["method"]))

    return resDict
|
||||
|
||||
|
||||
def main_validation(default_evaluation_params_fn, validate_data_fn):
    """
    This process validates a method
    Params:
    default_evaluation_params_fn: points to a function that returns a dictionary with the default parameters used for the evaluation
    validate_data_fn: points to a method that validates the correct format of the submission

    Parses ``-k=v`` pairs from sys.argv, runs the validator, then exits:
    status 0 on success, 101 (after printing the error) on any failure.
    """
    try:
        cli_params = dict([arg[1:].split("=") for arg in sys.argv[1:]])
        eval_params = default_evaluation_params_fn()
        if "p" in cli_params.keys():
            eval_params.update(
                cli_params["p"]
                if isinstance(cli_params["p"], dict)
                else json.loads(cli_params["p"][1:-1])
            )

        validate_data_fn(cli_params["g"], cli_params["s"], eval_params)
        print("SUCCESS")
        sys.exit(0)
    except Exception as e:
        print(str(e))
        sys.exit(101)
|
||||
@ -0,0 +1,402 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
from collections import namedtuple
|
||||
from . import rrc_evaluation_funcs
|
||||
import Polygon as plg
|
||||
import numpy as np
|
||||
|
||||
|
||||
def default_evaluation_params():
    """
    default_evaluation_params: Default parameters to use for the validation and evaluation.
    """
    params = {}
    params["IOU_CONSTRAINT"] = 0.5
    params["AREA_PRECISION_CONSTRAINT"] = 0.5
    # File-name patterns mapping GT/detection files to a sample id.
    params["GT_SAMPLE_NAME_2_ID"] = "gt_img_([0-9]+).txt"
    params["DET_SAMPLE_NAME_2_ID"] = "res_img_([0-9]+).txt"
    # LTRB:2points(left,top,right,bottom) or 4 points(x1,y1,x2,y2,x3,y3,x4,y4)
    params["LTRB"] = False
    # Lines are delimited by Windows CRLF format
    params["CRLF"] = False
    # Detections must include confidence value. AP will be calculated
    params["CONFIDENCES"] = False
    # Generate per sample results and produce data for visualization
    params["PER_SAMPLE_RESULTS"] = True
    return params
|
||||
|
||||
|
||||
def validate_data(gtFilePath, submFilePath, evaluationParams):
    """
    Method validate_data: validates that all files in the results folder are correct (have the correct name contents).
    Validates also that there are no missing files in the folder.
    If some error detected, the method raises the error

    Args:
        gtFilePath: folder with ground-truth files named per GT_SAMPLE_NAME_2_ID.
        submFilePath: folder with detection files named per DET_SAMPLE_NAME_2_ID;
            every file there must match the pattern (allEntries=True).
        evaluationParams: dict from ``default_evaluation_params`` (CRLF/LTRB/
            CONFIDENCES control the line format).
    """
    gt = rrc_evaluation_funcs.load_folder_file(
        gtFilePath, evaluationParams["GT_SAMPLE_NAME_2_ID"]
    )

    # allEntries=True: an unexpected file in the submission folder is an error.
    subm = rrc_evaluation_funcs.load_folder_file(
        submFilePath, evaluationParams["DET_SAMPLE_NAME_2_ID"], True
    )

    # Validate format of GroundTruth (GT lines carry transcriptions).
    for k in gt:
        rrc_evaluation_funcs.validate_lines_in_file(
            k, gt[k], evaluationParams["CRLF"], evaluationParams["LTRB"], True
        )

    # Validate format of results; every submitted sample must exist in GT.
    for k in subm:
        if (k in gt) == False:
            raise Exception("The sample %s not present in GT" % k)

        rrc_evaluation_funcs.validate_lines_in_file(
            k,
            subm[k],
            evaluationParams["CRLF"],
            evaluationParams["LTRB"],
            False,
            evaluationParams["CONFIDENCES"],
        )
|
||||
|
||||
|
||||
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
    """
    Method evaluate_method: evaluate method and returns the results
    Results. Dictionary with the following values:
    - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
    - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 }
    """

    def polygon_from_points(points):
        """
        Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4
        """
        # The 8 flat coordinates are packed so that row 0 holds the xs and
        # row 1 holds the ys after the reshape([2, 4]).T below.
        resBoxes = np.empty([1, 8], dtype="int32")
        resBoxes[0, 0] = int(points[0])
        resBoxes[0, 4] = int(points[1])
        resBoxes[0, 1] = int(points[2])
        resBoxes[0, 5] = int(points[3])
        resBoxes[0, 2] = int(points[4])
        resBoxes[0, 6] = int(points[5])
        resBoxes[0, 3] = int(points[6])
        resBoxes[0, 7] = int(points[7])
        pointMat = resBoxes[0].reshape([2, 4]).T
        return plg.Polygon(pointMat)

    def rectangle_to_polygon(rect):
        # Convert an axis-aligned Rectangle namedtuple into a 4-corner polygon.
        resBoxes = np.empty([1, 8], dtype="int32")
        resBoxes[0, 0] = int(rect.xmin)
        resBoxes[0, 4] = int(rect.ymax)
        resBoxes[0, 1] = int(rect.xmin)
        resBoxes[0, 5] = int(rect.ymin)
        resBoxes[0, 2] = int(rect.xmax)
        resBoxes[0, 6] = int(rect.ymin)
        resBoxes[0, 3] = int(rect.xmax)
        resBoxes[0, 7] = int(rect.ymax)

        pointMat = resBoxes[0].reshape([2, 4]).T

        return plg.Polygon(pointMat)

    def rectangle_to_points(rect):
        # Flat corner list in the same (x, y) interleaved order as the GT files.
        points = [
            int(rect.xmin),
            int(rect.ymax),
            int(rect.xmax),
            int(rect.ymax),
            int(rect.xmax),
            int(rect.ymin),
            int(rect.xmin),
            int(rect.ymin),
        ]
        return points

    def get_union(pD, pG):
        # |A ∪ B| = |A| + |B| - |A ∩ B|
        areaA = pD.area()
        areaB = pG.area()
        return areaA + areaB - get_intersection(pD, pG)

    def get_intersection_over_union(pD, pG):
        try:
            return get_intersection(pD, pG) / get_union(pD, pG)
        except:
            # Degenerate polygons (zero union) yield IoU 0.
            return 0

    def get_intersection(pD, pG):
        # `&` is the Polygon library's intersection operator.
        pInt = pD & pG
        if len(pInt) == 0:
            return 0
        return pInt.area()

    def compute_ap(confList, matchList, numGtCare):
        # Average precision over detections sorted by descending confidence.
        correct = 0
        AP = 0
        if len(confList) > 0:
            confList = np.array(confList)
            matchList = np.array(matchList)
            sorted_ind = np.argsort(-confList)
            confList = confList[sorted_ind]
            matchList = matchList[sorted_ind]
            for n in range(len(confList)):
                match = matchList[n]
                if match:
                    correct += 1
                    AP += float(correct) / (n + 1)

            if numGtCare > 0:
                AP /= numGtCare

        return AP

    perSampleMetrics = {}

    matchedSum = 0

    Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")

    gt = rrc_evaluation_funcs.load_folder_file(
        gtFilePath, evaluationParams["GT_SAMPLE_NAME_2_ID"]
    )
    subm = rrc_evaluation_funcs.load_folder_file(
        submFilePath, evaluationParams["DET_SAMPLE_NAME_2_ID"], True
    )

    numGlobalCareGt = 0
    numGlobalCareDet = 0

    arrGlobalConfidences = []
    arrGlobalMatches = []

    # One iteration per ground-truth sample (image).
    for resFile in gt:
        gtFile = gt[resFile]  # rrc_evaluation_funcs.decode_utf8(gt[resFile])
        recall = 0
        precision = 0
        hmean = 0

        detMatched = 0

        iouMat = np.empty([1, 1])

        gtPols = []
        detPols = []

        gtPolPoints = []
        detPolPoints = []

        # Array of Ground Truth Polygons' keys marked as don't Care
        gtDontCarePolsNum = []
        # Array of Detected Polygons' matched with a don't Care GT
        detDontCarePolsNum = []

        pairs = []
        detMatchedNums = []

        arrSampleConfidences = []
        arrSampleMatch = []
        sampleAP = 0

        evaluationLog = ""

        (
            pointsList,
            _,
            transcriptionsList,
        ) = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
            gtFile, evaluationParams["CRLF"], evaluationParams["LTRB"], True, False
        )
        for n in range(len(pointsList)):
            points = pointsList[n]
            transcription = transcriptionsList[n]
            # "###" is the ICDAR convention for a don't-care region.
            dontCare = transcription == "###"
            if evaluationParams["LTRB"]:
                gtRect = Rectangle(*points)
                gtPol = rectangle_to_polygon(gtRect)
            else:
                gtPol = polygon_from_points(points)
            gtPols.append(gtPol)
            gtPolPoints.append(points)
            if dontCare:
                gtDontCarePolsNum.append(len(gtPols) - 1)

        evaluationLog += (
            "GT polygons: "
            + str(len(gtPols))
            + (
                " (" + str(len(gtDontCarePolsNum)) + " don't care)\n"
                if len(gtDontCarePolsNum) > 0
                else "\n"
            )
        )

        if resFile in subm:
            detFile = subm[resFile]  # rrc_evaluation_funcs.decode_utf8(subm[resFile])

            (
                pointsList,
                confidencesList,
                _,
            ) = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
                detFile,
                evaluationParams["CRLF"],
                evaluationParams["LTRB"],
                False,
                evaluationParams["CONFIDENCES"],
            )
            for n in range(len(pointsList)):
                points = pointsList[n]

                if evaluationParams["LTRB"]:
                    detRect = Rectangle(*points)
                    detPol = rectangle_to_polygon(detRect)
                else:
                    detPol = polygon_from_points(points)
                detPols.append(detPol)
                detPolPoints.append(points)
                # A detection mostly covered by a don't-care GT region is
                # itself marked don't-care and excluded from the metrics.
                if len(gtDontCarePolsNum) > 0:
                    for dontCarePol in gtDontCarePolsNum:
                        dontCarePol = gtPols[dontCarePol]
                        intersected_area = get_intersection(dontCarePol, detPol)
                        pdDimensions = detPol.area()
                        precision = (
                            0 if pdDimensions == 0 else intersected_area / pdDimensions
                        )
                        if precision > evaluationParams["AREA_PRECISION_CONSTRAINT"]:
                            detDontCarePolsNum.append(len(detPols) - 1)
                            break

            evaluationLog += (
                "DET polygons: "
                + str(len(detPols))
                + (
                    " (" + str(len(detDontCarePolsNum)) + " don't care)\n"
                    if len(detDontCarePolsNum) > 0
                    else "\n"
                )
            )

            if len(gtPols) > 0 and len(detPols) > 0:
                # Calculate IoU and precision matrixs
                outputShape = [len(gtPols), len(detPols)]
                iouMat = np.empty(outputShape)
                gtRectMat = np.zeros(len(gtPols), np.int8)
                detRectMat = np.zeros(len(detPols), np.int8)
                for gtNum in range(len(gtPols)):
                    for detNum in range(len(detPols)):
                        pG = gtPols[gtNum]
                        pD = detPols[detNum]
                        iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)

                # Greedy one-to-one matching: first unmatched pair above the
                # IoU threshold wins; each polygon is matched at most once.
                for gtNum in range(len(gtPols)):
                    for detNum in range(len(detPols)):
                        if (
                            gtRectMat[gtNum] == 0
                            and detRectMat[detNum] == 0
                            and gtNum not in gtDontCarePolsNum
                            and detNum not in detDontCarePolsNum
                        ):
                            if (
                                iouMat[gtNum, detNum]
                                > evaluationParams["IOU_CONSTRAINT"]
                            ):
                                gtRectMat[gtNum] = 1
                                detRectMat[detNum] = 1
                                detMatched += 1
                                pairs.append({"gt": gtNum, "det": detNum})
                                detMatchedNums.append(detNum)
                                evaluationLog += (
                                    "Match GT #"
                                    + str(gtNum)
                                    + " with Det #"
                                    + str(detNum)
                                    + "\n"
                                )

            if evaluationParams["CONFIDENCES"]:
                for detNum in range(len(detPols)):
                    if detNum not in detDontCarePolsNum:
                        # we exclude the don't care detections
                        match = detNum in detMatchedNums

                        arrSampleConfidences.append(confidencesList[detNum])
                        arrSampleMatch.append(match)

                        arrGlobalConfidences.append(confidencesList[detNum])
                        arrGlobalMatches.append(match)

        numGtCare = len(gtPols) - len(gtDontCarePolsNum)
        numDetCare = len(detPols) - len(detDontCarePolsNum)
        if numGtCare == 0:
            # No care GT: perfect recall by convention; precision penalized
            # only if there are spurious care detections.
            recall = float(1)
            precision = float(0) if numDetCare > 0 else float(1)
            sampleAP = precision
        else:
            recall = float(detMatched) / numGtCare
            precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare
            if (
                evaluationParams["CONFIDENCES"]
                and evaluationParams["PER_SAMPLE_RESULTS"]
            ):
                sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, numGtCare)

        hmean = (
            0
            if (precision + recall) == 0
            else 2.0 * precision * recall / (precision + recall)
        )

        matchedSum += detMatched
        numGlobalCareGt += numGtCare
        numGlobalCareDet += numDetCare

        if evaluationParams["PER_SAMPLE_RESULTS"]:
            perSampleMetrics[resFile] = {
                "precision": precision,
                "recall": recall,
                "hmean": hmean,
                "pairs": pairs,
                # Large IoU matrices are omitted to keep results lightweight.
                "iouMat": [] if len(detPols) > 100 else iouMat.tolist(),
                "AP": sampleAP,
                "gtPolPoints": gtPolPoints,
                "detPolPoints": detPolPoints,
                "gtDontCare": gtDontCarePolsNum,
                "detDontCare": detDontCarePolsNum,
                "evaluationParams": evaluationParams,
                "evaluationLog": evaluationLog,
            }

    # Compute MAP and MAR
    AP = 0
    if evaluationParams["CONFIDENCES"]:
        AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt)

    methodRecall = 0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
    methodPrecision = (
        0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
    )
    methodHmean = (
        0
        if methodRecall + methodPrecision == 0
        else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
    )

    methodMetrics = {
        "precision": methodPrecision,
        "recall": methodRecall,
        "hmean": methodHmean,
        "AP": AP,
    }

    resDict = {
        "calculated": True,
        "Message": "",
        "method": methodMetrics,
        "per_sample": perSampleMetrics,
    }

    return resDict
|
||||
|
||||
|
||||
def cal_recall_precision_f1(gt_path, result_path, show_result=False):
    """Run the ICDAR-style detection evaluation and return the global metrics.

    :param gt_path: folder/archive of ground-truth files (named gt_img_<n>.txt
        per default_evaluation_params)
    :param result_path: folder/archive of detection results (res_img_<n>.txt)
    :param show_result: forwarded to main_evaluation as its last argument
    :return: the "method" entry of the evaluation result (global
        precision / recall / hmean / AP dict produced by evaluate_method)
    """
    p = {"g": gt_path, "s": result_path}
    result = rrc_evaluation_funcs.main_evaluation(
        p, default_evaluation_params, validate_data, evaluate_method, show_result
    )
    return result["method"]
|
||||
@ -0,0 +1,47 @@
|
||||
# -*- coding: utf-8 -*-
# @Time : 2019/12/7 14:46
# @Author : zhoujun

import numpy as np
import cv2
import os
import random
from tqdm import tqdm

# Compute per-channel mean and std over a random sample of training images.
train_txt_path = "./train_val_list.txt"

CNum = 10000  # number of images to sample for the statistics

img_h, img_w = 640, 640
means, stdevs = [], []

with open(train_txt_path, "r") as f:
    lines = f.readlines()
    random.shuffle(lines)  # shuffle so the sampled subset is random

    # Collect resized images in a list and stack once at the end.
    # The original code seeded the accumulator with np.zeros(...) and called
    # np.concatenate inside the loop: the zeros array added one all-black
    # image that biased the statistics, and repeated concatenation copies the
    # whole accumulator each iteration (O(n^2) work).
    samples = []
    for i in tqdm(range(CNum)):
        img_path = lines[i].split("\t")[0]

        img = cv2.imread(img_path)
        # NOTE(review): cv2.resize expects (width, height); harmless while
        # img_h == img_w == 640, but confirm if the sizes ever diverge.
        img = cv2.resize(img, (img_h, img_w))
        samples.append(img[:, :, :, np.newaxis])

    imgs = np.concatenate(samples, axis=3)

imgs = imgs.astype(np.float32) / 255.0

for i in tqdm(range(3)):
    pixels = imgs[:, :, i, :].ravel()  # flatten channel i across all images
    means.append(np.mean(pixels))
    stdevs.append(np.std(pixels))

# cv2 reads images as BGR; PIL/skimage read RGB and need no conversion.
means.reverse()  # BGR --> RGB
stdevs.reverse()

print("normMean = {}".format(means))
print("normStd = {}".format(stdevs))
print("transforms.Normalize(normMean = {}, normStd = {})".format(means, stdevs))
|
||||
@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
# @Time : 2019/8/24 12:06
# @Author : zhoujun
import os
import glob
import pathlib

# Build a "image_path<TAB>label_path" list file for a detection dataset.
data_path = r"test"
# data_path/img holds the images
# data_path/gt holds the label files

f_w = open(os.path.join(data_path, "test.txt"), "w", encoding="utf8")
for img_path in glob.glob(data_path + "/img/*.jpg", recursive=True):
    d = pathlib.Path(img_path)
    # Label files follow the naming convention gt_<image stem>.txt.
    label_path = os.path.join(data_path, "gt", ("gt_" + str(d.stem) + ".txt"))
    if os.path.exists(img_path) and os.path.exists(label_path):
        print(img_path, label_path)
    else:
        # "不存在" = "does not exist": reports a missing image/label pair.
        print("不存在", img_path, label_path)
    # NOTE(review): the pair is written even when the label file is missing —
    # confirm whether this line should live inside the `if` branch instead.
    f_w.write("{}\t{}\n".format(img_path, label_path))
f_w.close()
|
||||
60
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/metrics.py
Normal file
60
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/metrics.py
Normal file
@ -0,0 +1,60 @@
|
||||
# Adapted from score written by wkentaro
|
||||
# https://github.com/wkentaro/pytorch-fcn/blob/master/torchfcn/utils.py
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class runningScore(object):
    """Accumulates a confusion matrix over batches and derives segmentation
    scores (overall/mean accuracy, mean IoU, frequency-weighted accuracy)."""

    def __init__(self, n_classes):
        # n_classes: number of label classes; valid labels are 0..n_classes-1.
        self.n_classes = n_classes
        self.confusion_matrix = np.zeros((n_classes, n_classes))

    def _fast_hist(self, label_true, label_pred, n_class):
        # Only pixels whose ground-truth label is a valid class contribute.
        valid = (label_true >= 0) & (label_true < n_class)
        # Diagnostic: surface any negative predicted labels.
        if np.sum(label_pred[valid] < 0) > 0:
            print(label_pred[label_pred < 0])
        # Encode each (true, pred) pair as a single index and histogram it.
        flat_index = n_class * label_true[valid].astype(int) + label_pred[valid]
        counts = np.bincount(flat_index, minlength=n_class**2)
        return counts.reshape(n_class, n_class)

    def update(self, label_trues, label_preds):
        """Fold a batch of (ground truth, prediction) label arrays into the
        running confusion matrix."""
        for truth, guess in zip(label_trues, label_preds):
            try:
                batch_hist = self._fast_hist(
                    truth.flatten(), guess.flatten(), self.n_classes
                )
                self.confusion_matrix += batch_hist
            except:
                # Best-effort: a malformed pair is skipped, not fatal.
                pass

    def get_scores(self):
        """Returns accuracy score evaluation result.
        - overall accuracy
        - mean accuracy
        - mean IU
        - fwavacc
        """
        cm = self.confusion_matrix
        eps = 0.0001  # guards every division against empty rows/columns
        correct = np.diag(cm)
        overall_acc = correct.sum() / (cm.sum() + eps)
        per_class_acc = correct / (cm.sum(axis=1) + eps)
        mean_acc = np.nanmean(per_class_acc)
        union = cm.sum(axis=1) + cm.sum(axis=0) - correct
        iu = correct / (union + eps)
        mean_iu = np.nanmean(iu)
        freq = cm.sum(axis=1) / (cm.sum() + eps)
        fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()
        cls_iu = dict(zip(range(self.n_classes), iu))

        return {
            "Overall Acc": overall_acc,
            "Mean Acc": mean_acc,
            "FreqW Acc": fwavacc,
            "Mean IoU": mean_iu,
        }, cls_iu

    def reset(self):
        """Clear all accumulated statistics."""
        self.confusion_matrix = np.zeros((self.n_classes, self.n_classes))
|
||||
@ -0,0 +1,19 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/5 15:36
|
||||
# @Author : zhoujun
|
||||
from .icdar2015 import QuadMetric
|
||||
|
||||
|
||||
def get_metric(config):
    """Instantiate a metric object from a config dict.

    config["type"] names the metric class, resolved with eval() — so the name
    must already be importable in this module's namespace (e.g. QuadMetric).
    config["args"] is either a kwargs dict or a single positional argument;
    it defaults to {} when absent.

    Returns the constructed metric, or None if anything goes wrong.
    """
    try:
        if "args" not in config:
            args = {}
        else:
            args = config["args"]
        if isinstance(args, dict):
            # Dict args are expanded as keyword arguments.
            # NOTE(review): eval() on config-supplied text executes arbitrary
            # code — configs must come from a trusted source.
            cls = eval(config["type"])(**args)
        else:
            cls = eval(config["type"])(args)
        return cls
    except:
        # Any failure (unknown type name, bad arguments) silently yields None.
        return None
|
||||
@ -0,0 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/12/5 15:36
|
||||
# @Author : zhoujun
|
||||
|
||||
from .quad_metric import QuadMetric
|
||||
@ -0,0 +1,474 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
import math
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class DetectionDetEvalEvaluator(object):
    """DetEval-style text-detection evaluator.

    Matches ground-truth boxes to detections in three phases (one-to-one,
    one-to-many, many-to-one), accumulating recall/precision credit with
    configurable weights for each match type.
    """

    def __init__(
        self,
        area_recall_constraint=0.8,
        area_precision_constraint=0.4,
        ev_param_ind_center_diff_thr=1,
        mtype_oo_o=1.0,
        mtype_om_o=0.8,
        mtype_om_m=1.0,
    ):
        # Minimum area-recall / area-precision for a pair to count as matched.
        self.area_recall_constraint = area_recall_constraint
        self.area_precision_constraint = area_precision_constraint
        # Normalized center-distance threshold for one-to-one matches.
        self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr
        # Credit weights: one-to-one, one-to-many, many-to-one.
        self.mtype_oo_o = mtype_oo_o
        self.mtype_om_o = mtype_om_o
        self.mtype_om_m = mtype_om_m

    def evaluate_image(self, gt, pred):
        """Evaluate a single image.

        :param gt: list of dicts with "points" (polygon corners) and "ignore"
            (don't-care flag); a "text" key may be present but is unused.
        :param pred: list of dicts with "points".
        :return: per-sample metrics dict (precision, recall, hmean, pairs,
            matrices, care counts, accumulators, evaluation log).
        """

        def get_union(pD, pG):
            return Polygon(pD).union(Polygon(pG)).area

        def get_intersection_over_union(pD, pG):
            return get_intersection(pD, pG) / get_union(pD, pG)

        def get_intersection(pD, pG):
            return Polygon(pD).intersection(Polygon(pG)).area

        def one_to_one_match(row, col):
            # A one-to-one match requires exactly one qualifying detection in
            # the GT row, exactly one qualifying GT in the detection column,
            # and the (row, col) pair itself to qualify.
            cont = 0
            for j in range(len(recallMat[0])):
                if (
                    recallMat[row, j] >= self.area_recall_constraint
                    and precisionMat[row, j] >= self.area_precision_constraint
                ):
                    cont = cont + 1
            if cont != 1:
                return False
            cont = 0
            for i in range(len(recallMat)):
                if (
                    recallMat[i, col] >= self.area_recall_constraint
                    and precisionMat[i, col] >= self.area_precision_constraint
                ):
                    cont = cont + 1
            if cont != 1:
                return False

            if (
                recallMat[row, col] >= self.area_recall_constraint
                and precisionMat[row, col] >= self.area_precision_constraint
            ):
                return True
            return False

        def num_overlaps_gt(gtNum):
            # Count care detections with any overlap against this GT.
            cont = 0
            for detNum in range(len(detRects)):
                if detNum not in detDontCareRectsNum:
                    if recallMat[gtNum, detNum] > 0:
                        cont = cont + 1
            return cont

        def num_overlaps_det(detNum):
            # Count care GTs with any overlap against this detection.
            cont = 0
            for gtNum in range(len(recallMat)):
                if gtNum not in gtDontCareRectsNum:
                    if recallMat[gtNum, detNum] > 0:
                        cont = cont + 1
            return cont

        def is_single_overlap(row, col):
            if num_overlaps_gt(row) == 1 and num_overlaps_det(col) == 1:
                return True
            else:
                return False

        def one_to_many_match(gtNum):
            # One GT split across several detections: each candidate detection
            # must meet the precision constraint, and their combined recall
            # must reach the recall constraint.
            many_sum = 0
            detRects = []
            for detNum in range(len(recallMat[0])):
                if (
                    gtRectMat[gtNum] == 0
                    and detRectMat[detNum] == 0
                    and detNum not in detDontCareRectsNum
                ):
                    if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
                        many_sum += recallMat[gtNum, detNum]
                        detRects.append(detNum)
            if round(many_sum, 4) >= self.area_recall_constraint:
                return True, detRects
            else:
                return False, []

        def many_to_one_match(detNum):
            # One detection covering several GTs: mirror of one_to_many_match.
            many_sum = 0
            gtRects = []
            for gtNum in range(len(recallMat)):
                if (
                    gtRectMat[gtNum] == 0
                    and detRectMat[detNum] == 0
                    and gtNum not in gtDontCareRectsNum
                ):
                    if recallMat[gtNum, detNum] >= self.area_recall_constraint:
                        many_sum += precisionMat[gtNum, detNum]
                        gtRects.append(gtNum)
            if round(many_sum, 4) >= self.area_precision_constraint:
                return True, gtRects
            else:
                return False, []

        def center_distance(r1, r2):
            # Euclidean distance between the centroids of two corner lists.
            return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5

        def diag(r):
            # Diagonal length of the axis-aligned bounding box of the corners.
            r = np.array(r)
            return (
                (r[:, 0].max() - r[:, 0].min()) ** 2
                + (r[:, 1].max() - r[:, 1].min()) ** 2
            ) ** 0.5

        perSampleMetrics = {}

        recall = 0
        precision = 0
        hmean = 0
        recallAccum = 0.0
        precisionAccum = 0.0
        gtRects = []
        detRects = []
        gtPolPoints = []
        detPolPoints = []
        gtDontCareRectsNum = (
            []
        )  # Array of Ground Truth Rectangles' keys marked as don't Care
        detDontCareRectsNum = (
            []
        )  # Array of Detected Rectangles' matched with a don't Care GT
        pairs = []
        evaluationLog = ""

        recallMat = np.empty([1, 1])
        precisionMat = np.empty([1, 1])

        # Collect valid GT polygons; invalid/self-intersecting ones are skipped.
        for n in range(len(gt)):
            points = gt[n]["points"]
            # transcription = gt[n]['text']
            dontCare = gt[n]["ignore"]

            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            gtRects.append(points)
            gtPolPoints.append(points)
            if dontCare:
                gtDontCareRectsNum.append(len(gtRects) - 1)

        evaluationLog += (
            "GT rectangles: "
            + str(len(gtRects))
            + (
                " (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
                if len(gtDontCareRectsNum) > 0
                else "\n"
            )
        )

        # Collect valid detections; those mostly inside a don't-care GT are
        # themselves flagged don't-care.
        for n in range(len(pred)):
            points = pred[n]["points"]

            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            detRect = points
            detRects.append(detRect)
            detPolPoints.append(points)
            if len(gtDontCareRectsNum) > 0:
                for dontCareRectNum in gtDontCareRectsNum:
                    dontCareRect = gtRects[dontCareRectNum]
                    intersected_area = get_intersection(dontCareRect, detRect)
                    rdDimensions = Polygon(detRect).area
                    if rdDimensions == 0:
                        precision = 0
                    else:
                        precision = intersected_area / rdDimensions
                    if precision > self.area_precision_constraint:
                        detDontCareRectsNum.append(len(detRects) - 1)
                        break

        evaluationLog += (
            "DET rectangles: "
            + str(len(detRects))
            + (
                " (" + str(len(detDontCareRectsNum)) + " don't care)\n"
                if len(detDontCareRectsNum) > 0
                else "\n"
            )
        )

        if len(gtRects) == 0:
            recall = 1
            precision = 0 if len(detRects) > 0 else 1

        if len(detRects) > 0:
            # Calculate recall and precision matrixes
            outputShape = [len(gtRects), len(detRects)]
            recallMat = np.empty(outputShape)
            precisionMat = np.empty(outputShape)
            gtRectMat = np.zeros(len(gtRects), np.int8)
            detRectMat = np.zeros(len(detRects), np.int8)
            for gtNum in range(len(gtRects)):
                for detNum in range(len(detRects)):
                    rG = gtRects[gtNum]
                    rD = detRects[detNum]
                    intersected_area = get_intersection(rG, rD)
                    rgDimensions = Polygon(rG).area
                    rdDimensions = Polygon(rD).area
                    recallMat[gtNum, detNum] = (
                        0 if rgDimensions == 0 else intersected_area / rgDimensions
                    )
                    precisionMat[gtNum, detNum] = (
                        0 if rdDimensions == 0 else intersected_area / rdDimensions
                    )

            # Find one-to-one matches
            evaluationLog += "Find one-to-one matches\n"
            for gtNum in range(len(gtRects)):
                for detNum in range(len(detRects)):
                    if (
                        gtRectMat[gtNum] == 0
                        and detRectMat[detNum] == 0
                        and gtNum not in gtDontCareRectsNum
                        and detNum not in detDontCareRectsNum
                    ):
                        match = one_to_one_match(gtNum, detNum)
                        if match is True:
                            # in deteval we have to make other validation before mark as one-to-one
                            if is_single_overlap(gtNum, detNum) is True:
                                rG = gtRects[gtNum]
                                rD = detRects[detNum]
                                # Center distance normalized by the sum of the
                                # two boxes' diagonals.
                                normDist = center_distance(rG, rD)
                                normDist /= diag(rG) + diag(rD)
                                normDist *= 2.0
                                if normDist < self.ev_param_ind_center_diff_thr:
                                    gtRectMat[gtNum] = 1
                                    detRectMat[detNum] = 1
                                    recallAccum += self.mtype_oo_o
                                    precisionAccum += self.mtype_oo_o
                                    pairs.append(
                                        {"gt": gtNum, "det": detNum, "type": "OO"}
                                    )
                                    evaluationLog += (
                                        "Match GT #"
                                        + str(gtNum)
                                        + " with Det #"
                                        + str(detNum)
                                        + "\n"
                                    )
                                else:
                                    evaluationLog += (
                                        "Match Discarded GT #"
                                        + str(gtNum)
                                        + " with Det #"
                                        + str(detNum)
                                        + " normDist: "
                                        + str(normDist)
                                        + " \n"
                                    )
                            else:
                                evaluationLog += (
                                    "Match Discarded GT #"
                                    + str(gtNum)
                                    + " with Det #"
                                    + str(detNum)
                                    + " not single overlap\n"
                                )
            # Find one-to-many matches
            evaluationLog += "Find one-to-many matches\n"
            for gtNum in range(len(gtRects)):
                if gtNum not in gtDontCareRectsNum:
                    match, matchesDet = one_to_many_match(gtNum)
                    if match is True:
                        evaluationLog += "num_overlaps_gt=" + str(
                            num_overlaps_gt(gtNum)
                        )
                        # in deteval we have to make other validation before mark as one-to-one
                        if num_overlaps_gt(gtNum) >= 2:
                            gtRectMat[gtNum] = 1
                            # Single-detection splits earn full one-to-one
                            # credit; true splits earn the OM weight.
                            recallAccum += (
                                self.mtype_oo_o
                                if len(matchesDet) == 1
                                else self.mtype_om_o
                            )
                            precisionAccum += (
                                self.mtype_oo_o
                                if len(matchesDet) == 1
                                else self.mtype_om_o * len(matchesDet)
                            )
                            pairs.append(
                                {
                                    "gt": gtNum,
                                    "det": matchesDet,
                                    "type": "OO" if len(matchesDet) == 1 else "OM",
                                }
                            )
                            for detNum in matchesDet:
                                detRectMat[detNum] = 1
                            evaluationLog += (
                                "Match GT #"
                                + str(gtNum)
                                + " with Det #"
                                + str(matchesDet)
                                + "\n"
                            )
                        else:
                            evaluationLog += (
                                "Match Discarded GT #"
                                + str(gtNum)
                                + " with Det #"
                                + str(matchesDet)
                                + " not single overlap\n"
                            )

            # Find many-to-one matches
            evaluationLog += "Find many-to-one matches\n"
            for detNum in range(len(detRects)):
                if detNum not in detDontCareRectsNum:
                    match, matchesGt = many_to_one_match(detNum)
                    if match is True:
                        # in deteval we have to make other validation before mark as one-to-one
                        if num_overlaps_det(detNum) >= 2:
                            detRectMat[detNum] = 1
                            recallAccum += (
                                self.mtype_oo_o
                                if len(matchesGt) == 1
                                else self.mtype_om_m * len(matchesGt)
                            )
                            precisionAccum += (
                                self.mtype_oo_o
                                if len(matchesGt) == 1
                                else self.mtype_om_m
                            )
                            pairs.append(
                                {
                                    "gt": matchesGt,
                                    "det": detNum,
                                    "type": "OO" if len(matchesGt) == 1 else "MO",
                                }
                            )
                            for gtNum in matchesGt:
                                gtRectMat[gtNum] = 1
                            evaluationLog += (
                                "Match GT #"
                                + str(matchesGt)
                                + " with Det #"
                                + str(detNum)
                                + "\n"
                            )
                        else:
                            evaluationLog += (
                                "Match Discarded GT #"
                                + str(matchesGt)
                                + " with Det #"
                                + str(detNum)
                                + " not single overlap\n"
                            )

            numGtCare = len(gtRects) - len(gtDontCareRectsNum)
            if numGtCare == 0:
                recall = float(1)
                precision = float(0) if len(detRects) > 0 else float(1)
            else:
                recall = float(recallAccum) / numGtCare
                precision = (
                    float(0)
                    if (len(detRects) - len(detDontCareRectsNum)) == 0
                    else float(precisionAccum)
                    / (len(detRects) - len(detDontCareRectsNum))
                )
            hmean = (
                0
                if (precision + recall) == 0
                else 2.0 * precision * recall / (precision + recall)
            )

        numGtCare = len(gtRects) - len(gtDontCareRectsNum)
        numDetCare = len(detRects) - len(detDontCareRectsNum)

        perSampleMetrics = {
            "precision": precision,
            "recall": recall,
            "hmean": hmean,
            "pairs": pairs,
            # Large matrices are omitted to keep the result lightweight.
            "recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
            "precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
            "gtPolPoints": gtPolPoints,
            "detPolPoints": detPolPoints,
            "gtCare": numGtCare,
            "detCare": numDetCare,
            "gtDontCare": gtDontCareRectsNum,
            "detDontCare": detDontCareRectsNum,
            "recallAccum": recallAccum,
            "precisionAccum": precisionAccum,
            "evaluationLog": evaluationLog,
        }

        return perSampleMetrics

    def combine_results(self, results):
        """Aggregate per-image results into dataset-level precision/recall/hmean."""
        numGt = 0
        numDet = 0
        methodRecallSum = 0
        methodPrecisionSum = 0

        for result in results:
            numGt += result["gtCare"]
            numDet += result["detCare"]
            methodRecallSum += result["recallAccum"]
            methodPrecisionSum += result["precisionAccum"]

        methodRecall = 0 if numGt == 0 else methodRecallSum / numGt
        methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet
        methodHmean = (
            0
            if methodRecall + methodPrecision == 0
            else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
        )

        methodMetrics = {
            "precision": methodPrecision,
            "recall": methodRecall,
            "hmean": methodHmean,
        }

        return methodMetrics
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: one image with one care GT box, one don't-care GT box, and
    # a single prediction slightly offset from the care box.
    evaluator = DetectionDetEvalEvaluator()
    gts = [
        [
            {
                "points": [(0, 0), (1, 0), (1, 1), (0, 1)],
                "text": 1234,
                "ignore": False,
            },
            {
                "points": [(2, 2), (3, 2), (3, 3), (2, 3)],
                "text": 5678,
                "ignore": True,
            },
        ]
    ]
    preds = [
        [
            {
                "points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
                "text": 123,
                "ignore": False,
            }
        ]
    ]
    results = []
    for gt, pred in zip(gts, preds):
        results.append(evaluator.evaluate_image(gt, pred))
    metrics = evaluator.combine_results(results)
    print(metrics)
|
||||
@ -0,0 +1,417 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
import math
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class DetectionICDAR2013Evaluator(object):
|
||||
def __init__(
|
||||
self,
|
||||
area_recall_constraint=0.8,
|
||||
area_precision_constraint=0.4,
|
||||
ev_param_ind_center_diff_thr=1,
|
||||
mtype_oo_o=1.0,
|
||||
mtype_om_o=0.8,
|
||||
mtype_om_m=1.0,
|
||||
):
|
||||
self.area_recall_constraint = area_recall_constraint
|
||||
self.area_precision_constraint = area_precision_constraint
|
||||
self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr
|
||||
self.mtype_oo_o = mtype_oo_o
|
||||
self.mtype_om_o = mtype_om_o
|
||||
self.mtype_om_m = mtype_om_m
|
||||
|
||||
def evaluate_image(self, gt, pred):
    """Evaluate one image's detections with the DetEval protocol.

    gt / pred: lists of dicts with "points" (polygon vertices); gt entries
    also carry an "ignore" flag marking don't-care regions.
    Returns a per-sample metrics dict (precision/recall/hmean, match pairs,
    raw accumulators and a textual evaluation log).
    """

    def get_union(pD, pG):
        # Area of the union of the two polygons.
        return Polygon(pD).union(Polygon(pG)).area

    def get_intersection_over_union(pD, pG):
        # IoU of the two polygons; not called in this method (kept for
        # parity with the sibling IoU evaluator).
        return get_intersection(pD, pG) / get_union(pD, pG)

    def get_intersection(pD, pG):
        # Area of the intersection of the two polygons.
        return Polygon(pD).intersection(Polygon(pG)).area

    def one_to_one_match(row, col):
        # (row, col) is one-to-one iff it is the ONLY qualified pair in
        # both its row and its column of recallMat/precisionMat.
        cont = 0
        for j in range(len(recallMat[0])):
            if (
                recallMat[row, j] >= self.area_recall_constraint
                and precisionMat[row, j] >= self.area_precision_constraint
            ):
                cont = cont + 1
        if cont != 1:
            return False
        cont = 0
        for i in range(len(recallMat)):
            if (
                recallMat[i, col] >= self.area_recall_constraint
                and precisionMat[i, col] >= self.area_precision_constraint
            ):
                cont = cont + 1
        if cont != 1:
            return False

        if (
            recallMat[row, col] >= self.area_recall_constraint
            and precisionMat[row, col] >= self.area_precision_constraint
        ):
            return True
        return False

    def one_to_many_match(gtNum):
        # One gt covered by several detections: each candidate detection
        # must satisfy the precision constraint, and their summed recall
        # must reach the recall constraint.
        many_sum = 0
        detRects = []
        for detNum in range(len(recallMat[0])):
            if (
                gtRectMat[gtNum] == 0
                and detRectMat[detNum] == 0
                and detNum not in detDontCareRectsNum
            ):
                if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
                    many_sum += recallMat[gtNum, detNum]
                    detRects.append(detNum)
        if round(many_sum, 4) >= self.area_recall_constraint:
            return True, detRects
        else:
            return False, []

    def many_to_one_match(detNum):
        # Several gts covered by one detection; symmetric to the above.
        many_sum = 0
        gtRects = []
        for gtNum in range(len(recallMat)):
            if (
                gtRectMat[gtNum] == 0
                and detRectMat[detNum] == 0
                and gtNum not in gtDontCareRectsNum
            ):
                if recallMat[gtNum, detNum] >= self.area_recall_constraint:
                    many_sum += precisionMat[gtNum, detNum]
                    gtRects.append(gtNum)
        if round(many_sum, 4) >= self.area_precision_constraint:
            return True, gtRects
        else:
            return False, []

    def center_distance(r1, r2):
        # Euclidean distance between the vertex centroids of two polygons.
        return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5

    def diag(r):
        # Diagonal length of the polygon's axis-aligned bounding box.
        r = np.array(r)
        return (
            (r[:, 0].max() - r[:, 0].min()) ** 2
            + (r[:, 1].max() - r[:, 1].min()) ** 2
        ) ** 0.5

    perSampleMetrics = {}

    recall = 0
    precision = 0
    hmean = 0
    recallAccum = 0.0
    precisionAccum = 0.0
    gtRects = []
    detRects = []
    gtPolPoints = []
    detPolPoints = []
    gtDontCareRectsNum = (
        []
    )  # Array of Ground Truth Rectangles' keys marked as don't Care
    detDontCareRectsNum = (
        []
    )  # Array of Detected Rectangles' matched with a don't Care GT
    pairs = []
    evaluationLog = ""

    recallMat = np.empty([1, 1])
    precisionMat = np.empty([1, 1])

    # Collect valid gt polygons; invalid or non-simple polygons are skipped.
    for n in range(len(gt)):
        points = gt[n]["points"]
        # transcription = gt[n]['text']
        dontCare = gt[n]["ignore"]

        if not Polygon(points).is_valid or not Polygon(points).is_simple:
            continue

        gtRects.append(points)
        gtPolPoints.append(points)
        if dontCare:
            gtDontCareRectsNum.append(len(gtRects) - 1)

    evaluationLog += (
        "GT rectangles: "
        + str(len(gtRects))
        + (
            " (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
            if len(gtDontCareRectsNum) > 0
            else "\n"
        )
    )

    # Collect valid detections; a detection whose area is mostly inside a
    # don't-care gt is itself marked don't-care.
    for n in range(len(pred)):
        points = pred[n]["points"]

        if not Polygon(points).is_valid or not Polygon(points).is_simple:
            continue

        detRect = points
        detRects.append(detRect)
        detPolPoints.append(points)
        if len(gtDontCareRectsNum) > 0:
            for dontCareRectNum in gtDontCareRectsNum:
                dontCareRect = gtRects[dontCareRectNum]
                intersected_area = get_intersection(dontCareRect, detRect)
                rdDimensions = Polygon(detRect).area
                if rdDimensions == 0:
                    precision = 0
                else:
                    precision = intersected_area / rdDimensions
                if precision > self.area_precision_constraint:
                    detDontCareRectsNum.append(len(detRects) - 1)
                    break

    evaluationLog += (
        "DET rectangles: "
        + str(len(detRects))
        + (
            " (" + str(len(detDontCareRectsNum)) + " don't care)\n"
            if len(detDontCareRectsNum) > 0
            else "\n"
        )
    )

    if len(gtRects) == 0:
        recall = 1
        precision = 0 if len(detRects) > 0 else 1

    if len(detRects) > 0:
        # Calculate recall and precision matrixes
        outputShape = [len(gtRects), len(detRects)]
        recallMat = np.empty(outputShape)
        precisionMat = np.empty(outputShape)
        gtRectMat = np.zeros(len(gtRects), np.int8)
        detRectMat = np.zeros(len(detRects), np.int8)
        for gtNum in range(len(gtRects)):
            for detNum in range(len(detRects)):
                rG = gtRects[gtNum]
                rD = detRects[detNum]
                intersected_area = get_intersection(rG, rD)
                rgDimensions = Polygon(rG).area
                rdDimensions = Polygon(rD).area
                recallMat[gtNum, detNum] = (
                    0 if rgDimensions == 0 else intersected_area / rgDimensions
                )
                precisionMat[gtNum, detNum] = (
                    0 if rdDimensions == 0 else intersected_area / rdDimensions
                )

        # Find one-to-one matches
        evaluationLog += "Find one-to-one matches\n"
        for gtNum in range(len(gtRects)):
            for detNum in range(len(detRects)):
                if (
                    gtRectMat[gtNum] == 0
                    and detRectMat[detNum] == 0
                    and gtNum not in gtDontCareRectsNum
                    and detNum not in detDontCareRectsNum
                ):
                    match = one_to_one_match(gtNum, detNum)
                    if match is True:
                        # in deteval we have to make other validation before mark as one-to-one
                        rG = gtRects[gtNum]
                        rD = detRects[detNum]
                        normDist = center_distance(rG, rD)
                        normDist /= diag(rG) + diag(rD)
                        normDist *= 2.0
                        if normDist < self.ev_param_ind_center_diff_thr:
                            gtRectMat[gtNum] = 1
                            detRectMat[detNum] = 1
                            recallAccum += self.mtype_oo_o
                            precisionAccum += self.mtype_oo_o
                            pairs.append({"gt": gtNum, "det": detNum, "type": "OO"})
                            evaluationLog += (
                                "Match GT #"
                                + str(gtNum)
                                + " with Det #"
                                + str(detNum)
                                + "\n"
                            )
                        else:
                            evaluationLog += (
                                "Match Discarded GT #"
                                + str(gtNum)
                                + " with Det #"
                                + str(detNum)
                                + " normDist: "
                                + str(normDist)
                                + " \n"
                            )
        # Find one-to-many matches
        evaluationLog += "Find one-to-many matches\n"
        for gtNum in range(len(gtRects)):
            if gtNum not in gtDontCareRectsNum:
                match, matchesDet = one_to_many_match(gtNum)
                if match is True:
                    # NOTE(review): num_overlaps_gt is not defined in this
                    # method nor visible anywhere in this chunk; this line
                    # appears to raise NameError whenever a one-to-many
                    # match occurs -- confirm against the full module.
                    evaluationLog += "num_overlaps_gt=" + str(
                        num_overlaps_gt(gtNum)
                    )
                    gtRectMat[gtNum] = 1
                    recallAccum += (
                        self.mtype_oo_o if len(matchesDet) == 1 else self.mtype_om_o
                    )
                    precisionAccum += (
                        self.mtype_oo_o
                        if len(matchesDet) == 1
                        else self.mtype_om_o * len(matchesDet)
                    )
                    pairs.append(
                        {
                            "gt": gtNum,
                            "det": matchesDet,
                            "type": "OO" if len(matchesDet) == 1 else "OM",
                        }
                    )
                    for detNum in matchesDet:
                        detRectMat[detNum] = 1
                    evaluationLog += (
                        "Match GT #"
                        + str(gtNum)
                        + " with Det #"
                        + str(matchesDet)
                        + "\n"
                    )

        # Find many-to-one matches
        evaluationLog += "Find many-to-one matches\n"
        for detNum in range(len(detRects)):
            if detNum not in detDontCareRectsNum:
                match, matchesGt = many_to_one_match(detNum)
                if match is True:
                    detRectMat[detNum] = 1
                    recallAccum += (
                        self.mtype_oo_o
                        if len(matchesGt) == 1
                        else self.mtype_om_m * len(matchesGt)
                    )
                    precisionAccum += (
                        self.mtype_oo_o if len(matchesGt) == 1 else self.mtype_om_m
                    )
                    pairs.append(
                        {
                            "gt": matchesGt,
                            "det": detNum,
                            "type": "OO" if len(matchesGt) == 1 else "MO",
                        }
                    )
                    for gtNum in matchesGt:
                        gtRectMat[gtNum] = 1
                    evaluationLog += (
                        "Match GT #"
                        + str(matchesGt)
                        + " with Det #"
                        + str(detNum)
                        + "\n"
                    )

        numGtCare = len(gtRects) - len(gtDontCareRectsNum)
        if numGtCare == 0:
            recall = float(1)
            precision = float(0) if len(detRects) > 0 else float(1)
        else:
            recall = float(recallAccum) / numGtCare
            precision = (
                float(0)
                if (len(detRects) - len(detDontCareRectsNum)) == 0
                else float(precisionAccum)
                / (len(detRects) - len(detDontCareRectsNum))
            )
        hmean = (
            0
            if (precision + recall) == 0
            else 2.0 * precision * recall / (precision + recall)
        )

    numGtCare = len(gtRects) - len(gtDontCareRectsNum)
    numDetCare = len(detRects) - len(detDontCareRectsNum)

    perSampleMetrics = {
        "precision": precision,
        "recall": recall,
        "hmean": hmean,
        "pairs": pairs,
        # Matrices are dropped for very crowded images to keep results small.
        "recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
        "precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
        "gtPolPoints": gtPolPoints,
        "detPolPoints": detPolPoints,
        "gtCare": numGtCare,
        "detCare": numDetCare,
        "gtDontCare": gtDontCareRectsNum,
        "detDontCare": detDontCareRectsNum,
        "recallAccum": recallAccum,
        "precisionAccum": precisionAccum,
        "evaluationLog": evaluationLog,
    }

    return perSampleMetrics
|
||||
|
||||
def combine_results(self, results):
    """Aggregate per-image DetEval results into dataset-level metrics.

    results: iterable of dicts as produced by evaluate_image (must carry
    "gtCare", "detCare", "recallAccum" and "precisionAccum").
    Returns a dict with "precision", "recall" and "hmean".
    """
    total_gt = sum(item["gtCare"] for item in results)
    total_det = sum(item["detCare"] for item in results)
    recall_sum = sum(item["recallAccum"] for item in results)
    precision_sum = sum(item["precisionAccum"] for item in results)

    # Guard the divisions: an empty dataset yields zero metrics.
    method_recall = recall_sum / total_gt if total_gt != 0 else 0
    method_precision = precision_sum / total_det if total_det != 0 else 0
    denominator = method_recall + method_precision
    if denominator == 0:
        method_hmean = 0
    else:
        method_hmean = 2 * method_recall * method_precision / denominator

    return {
        "precision": method_precision,
        "recall": method_recall,
        "hmean": method_hmean,
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: one image with a cared and an ignored gt box against a
    # single slightly-perturbed detection.
    evaluator = DetectionICDAR2013Evaluator()
    gts = [[
        {"points": [(0, 0), (1, 0), (1, 1), (0, 1)], "text": 1234, "ignore": False},
        {"points": [(2, 2), (3, 2), (3, 3), (2, 3)], "text": 5678, "ignore": True},
    ]]
    preds = [[
        {"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], "text": 123, "ignore": False},
    ]]
    results = [evaluator.evaluate_image(g, p) for g, p in zip(gts, preds)]
    print(evaluator.combine_results(results))
|
||||
@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
from shapely.geometry import Polygon
|
||||
import cv2
|
||||
|
||||
|
||||
def iou_rotate(box_a, box_b, method="union"):
    """Overlap ratio of two boxes via their minimum-area (rotated) rectangles.

    method="union": standard IoU (intersection / union).
    method="intersection": intersection / min(area_a, area_b).
    Returns 0 when the rectangles do not intersect or either area term
    degenerates; raises NotImplementedError for an unknown method.
    """
    status, inter_region = cv2.rotatedRectangleIntersection(
        cv2.minAreaRect(box_a), cv2.minAreaRect(box_b)
    )
    # cv2 reports status 0 when there is no intersection at all.
    if status == 0:
        return 0
    inter_area = cv2.contourArea(inter_region)
    area_a = cv2.contourArea(box_a)
    area_b = cv2.contourArea(box_b)
    union_area = area_a + area_b - inter_area
    if union_area == 0 or inter_area == 0:
        return 0
    if method == "union":
        return inter_area / union_area
    if method == "intersection":
        return inter_area / min(area_a, area_b)
    raise NotImplementedError
|
||||
|
||||
|
||||
class DetectionIoUEvaluator(object):
    """IoU-based text-detection evaluator (ICDAR15-style protocol).

    A detection matches an unmatched, non-ignored gt polygon when their
    IoU exceeds ``iou_constraint``; per-image counts are combined into
    dataset metrics by combine_results().
    """

    def __init__(
        self, is_output_polygon=False, iou_constraint=0.5, area_precision_constraint=0.5
    ):
        # is_output_polygon: True -> exact polygon IoU (shapely);
        #                    False -> rotated-rectangle IoU (iou_rotate).
        # iou_constraint: IoU threshold for a gt/det pair to count as a match.
        # area_precision_constraint: overlap ratio above which a detection is
        # absorbed by a don't-care gt region.
        self.is_output_polygon = is_output_polygon
        self.iou_constraint = iou_constraint
        self.area_precision_constraint = area_precision_constraint

    def evaluate_image(self, gt, pred):
        """Evaluate one image; gt/pred are lists of {"points": ..., "ignore": ...} dicts."""

        def get_union(pD, pG):
            # Area of the union of the two polygons.
            return Polygon(pD).union(Polygon(pG)).area

        def get_intersection_over_union(pD, pG):
            # Exact polygon IoU.
            return get_intersection(pD, pG) / get_union(pD, pG)

        def get_intersection(pD, pG):
            # Area of the intersection of the two polygons.
            return Polygon(pD).intersection(Polygon(pG)).area

        def compute_ap(confList, matchList, numGtCare):
            # Average precision over confidence-sorted matches.
            # Defined here but never invoked in this method.
            correct = 0
            AP = 0
            if len(confList) > 0:
                confList = np.array(confList)
                matchList = np.array(matchList)
                sorted_ind = np.argsort(-confList)
                confList = confList[sorted_ind]
                matchList = matchList[sorted_ind]
                for n in range(len(confList)):
                    match = matchList[n]
                    if match:
                        correct += 1
                        AP += float(correct) / (n + 1)

                if numGtCare > 0:
                    AP /= numGtCare

            return AP

        perSampleMetrics = {}

        # The following accumulators are local to this single sample; only
        # this image's counts ever flow into them.
        matchedSum = 0

        Rectangle = namedtuple("Rectangle", "xmin ymin xmax ymax")  # unused here

        numGlobalCareGt = 0
        numGlobalCareDet = 0

        arrGlobalConfidences = []  # not populated in this method
        arrGlobalMatches = []  # not populated in this method

        recall = 0
        precision = 0
        hmean = 0

        detMatched = 0

        iouMat = np.empty([1, 1])

        gtPols = []
        detPols = []

        gtPolPoints = []
        detPolPoints = []

        # Array of Ground Truth Polygons' keys marked as don't Care
        gtDontCarePolsNum = []
        # Array of Detected Polygons' matched with a don't Care GT
        detDontCarePolsNum = []

        pairs = []
        detMatchedNums = []

        arrSampleConfidences = []  # not populated in this method
        arrSampleMatch = []  # not populated in this method

        evaluationLog = ""

        # Collect valid gt polygons; invalid or non-simple ones are skipped.
        for n in range(len(gt)):
            points = gt[n]["points"]
            # transcription = gt[n]['text']
            dontCare = gt[n]["ignore"]

            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            gtPol = points
            gtPols.append(gtPol)
            gtPolPoints.append(points)
            if dontCare:
                gtDontCarePolsNum.append(len(gtPols) - 1)

        evaluationLog += (
            "GT polygons: "
            + str(len(gtPols))
            + (
                " (" + str(len(gtDontCarePolsNum)) + " don't care)\n"
                if len(gtDontCarePolsNum) > 0
                else "\n"
            )
        )

        # Collect valid detections; a detection mostly covered by a
        # don't-care gt is itself marked don't-care.
        for n in range(len(pred)):
            points = pred[n]["points"]
            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            detPol = points
            detPols.append(detPol)
            detPolPoints.append(points)
            if len(gtDontCarePolsNum) > 0:
                for dontCarePol in gtDontCarePolsNum:
                    dontCarePol = gtPols[dontCarePol]
                    intersected_area = get_intersection(dontCarePol, detPol)
                    pdDimensions = Polygon(detPol).area
                    precision = (
                        0 if pdDimensions == 0 else intersected_area / pdDimensions
                    )
                    if precision > self.area_precision_constraint:
                        detDontCarePolsNum.append(len(detPols) - 1)
                        break

        evaluationLog += (
            "DET polygons: "
            + str(len(detPols))
            + (
                " (" + str(len(detDontCarePolsNum)) + " don't care)\n"
                if len(detDontCarePolsNum) > 0
                else "\n"
            )
        )

        if len(gtPols) > 0 and len(detPols) > 0:
            # Calculate IoU and precision matrixs
            outputShape = [len(gtPols), len(detPols)]
            iouMat = np.empty(outputShape)
            gtRectMat = np.zeros(len(gtPols), np.int8)
            detRectMat = np.zeros(len(detPols), np.int8)
            if self.is_output_polygon:
                for gtNum in range(len(gtPols)):
                    for detNum in range(len(detPols)):
                        pG = gtPols[gtNum]
                        pD = detPols[detNum]
                        iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
            else:
                # gtPols = np.float32(gtPols)
                # detPols = np.float32(detPols)
                for gtNum in range(len(gtPols)):
                    for detNum in range(len(detPols)):
                        pG = np.float32(gtPols[gtNum])
                        pD = np.float32(detPols[detNum])
                        iouMat[gtNum, detNum] = iou_rotate(pD, pG)
            # Greedy first-come matching: each gt and det is used at most once.
            for gtNum in range(len(gtPols)):
                for detNum in range(len(detPols)):
                    if (
                        gtRectMat[gtNum] == 0
                        and detRectMat[detNum] == 0
                        and gtNum not in gtDontCarePolsNum
                        and detNum not in detDontCarePolsNum
                    ):
                        if iouMat[gtNum, detNum] > self.iou_constraint:
                            gtRectMat[gtNum] = 1
                            detRectMat[detNum] = 1
                            detMatched += 1
                            pairs.append({"gt": gtNum, "det": detNum})
                            detMatchedNums.append(detNum)
                            evaluationLog += (
                                "Match GT #"
                                + str(gtNum)
                                + " with Det #"
                                + str(detNum)
                                + "\n"
                            )

        numGtCare = len(gtPols) - len(gtDontCarePolsNum)
        numDetCare = len(detPols) - len(detDontCarePolsNum)
        if numGtCare == 0:
            recall = float(1)
            precision = float(0) if numDetCare > 0 else float(1)
        else:
            recall = float(detMatched) / numGtCare
            precision = 0 if numDetCare == 0 else float(detMatched) / numDetCare

        hmean = (
            0
            if (precision + recall) == 0
            else 2.0 * precision * recall / (precision + recall)
        )

        matchedSum += detMatched
        numGlobalCareGt += numGtCare
        numGlobalCareDet += numDetCare

        perSampleMetrics = {
            "precision": precision,
            "recall": recall,
            "hmean": hmean,
            "pairs": pairs,
            # The matrix is dropped for very crowded images to keep results small.
            "iouMat": [] if len(detPols) > 100 else iouMat.tolist(),
            "gtPolPoints": gtPolPoints,
            "detPolPoints": detPolPoints,
            "gtCare": numGtCare,
            "detCare": numDetCare,
            "gtDontCare": gtDontCarePolsNum,
            "detDontCare": detDontCarePolsNum,
            "detMatched": detMatched,
            "evaluationLog": evaluationLog,
        }

        return perSampleMetrics

    def combine_results(self, results):
        """Combine per-image results into dataset precision/recall/hmean."""
        numGlobalCareGt = 0
        numGlobalCareDet = 0
        matchedSum = 0
        for result in results:
            numGlobalCareGt += result["gtCare"]
            numGlobalCareDet += result["detCare"]
            matchedSum += result["detMatched"]

        methodRecall = (
            0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
        )
        methodPrecision = (
            0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
        )
        methodHmean = (
            0
            if methodRecall + methodPrecision == 0
            else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
        )

        methodMetrics = {
            "precision": methodPrecision,
            "recall": methodRecall,
            "hmean": methodHmean,
        }

        return methodMetrics
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: one gt box split across two predicted halves.
    evaluator = DetectionIoUEvaluator()
    preds = [[
        {"points": [(0.1, 0.1), (0.5, 0), (0.5, 1), (0, 1)], "text": 1234, "ignore": False},
        {"points": [(0.5, 0.1), (1, 0), (1, 1), (0.5, 1)], "text": 5678, "ignore": False},
    ]]
    gts = [[
        {"points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)], "text": 123, "ignore": False},
    ]]
    results = [evaluator.evaluate_image(g, p) for g, p in zip(gts, preds)]
    print(evaluator.combine_results(results))
|
||||
@ -0,0 +1,398 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
import math
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
from shapely.geometry import Polygon
|
||||
|
||||
|
||||
class DetectionMTWI2018Evaluator(object):
    """DetEval-style evaluator for the MTWI-2018 text-detection task.

    Supports one-to-one, one-to-many and many-to-one gt/detection matches;
    the "many" side of a split/merged match is down-weighted by
    k / (1 + log(k)).
    """

    def __init__(
        self,
        area_recall_constraint=0.7,
        area_precision_constraint=0.7,
        ev_param_ind_center_diff_thr=1,
    ):
        # area_recall_constraint: minimum gt-area coverage for a match.
        # area_precision_constraint: minimum detection-area coverage.
        # ev_param_ind_center_diff_thr: normalized center-distance threshold
        # applied when validating a one-to-one candidate.
        self.area_recall_constraint = area_recall_constraint
        self.area_precision_constraint = area_precision_constraint
        self.ev_param_ind_center_diff_thr = ev_param_ind_center_diff_thr

    def evaluate_image(self, gt, pred):
        """Evaluate one image; gt/pred are lists of {"points": ..., "ignore": ...} dicts."""

        def get_union(pD, pG):
            # Area of the union of the two polygons.
            return Polygon(pD).union(Polygon(pG)).area

        def get_intersection_over_union(pD, pG):
            # IoU of the two polygons; not called in this method.
            return get_intersection(pD, pG) / get_union(pD, pG)

        def get_intersection(pD, pG):
            # Area of the intersection of the two polygons.
            return Polygon(pD).intersection(Polygon(pG)).area

        def one_to_one_match(row, col):
            # (row, col) is one-to-one iff it is the ONLY qualified pair in
            # both its row and its column of recallMat/precisionMat.
            cont = 0
            for j in range(len(recallMat[0])):
                if (
                    recallMat[row, j] >= self.area_recall_constraint
                    and precisionMat[row, j] >= self.area_precision_constraint
                ):
                    cont = cont + 1
            if cont != 1:
                return False
            cont = 0
            for i in range(len(recallMat)):
                if (
                    recallMat[i, col] >= self.area_recall_constraint
                    and precisionMat[i, col] >= self.area_precision_constraint
                ):
                    cont = cont + 1
            if cont != 1:
                return False

            if (
                recallMat[row, col] >= self.area_recall_constraint
                and precisionMat[row, col] >= self.area_precision_constraint
            ):
                return True
            return False

        def one_to_many_match(gtNum):
            # One gt covered by several detections: each candidate must
            # satisfy the precision constraint; summed recall must reach
            # the recall constraint.
            many_sum = 0
            detRects = []
            for detNum in range(len(recallMat[0])):
                if (
                    gtRectMat[gtNum] == 0
                    and detRectMat[detNum] == 0
                    and detNum not in detDontCareRectsNum
                ):
                    if precisionMat[gtNum, detNum] >= self.area_precision_constraint:
                        many_sum += recallMat[gtNum, detNum]
                        detRects.append(detNum)
            if round(many_sum, 4) >= self.area_recall_constraint:
                return True, detRects
            else:
                return False, []

        def many_to_one_match(detNum):
            # Several gts covered by one detection; symmetric to the above.
            many_sum = 0
            gtRects = []
            for gtNum in range(len(recallMat)):
                if (
                    gtRectMat[gtNum] == 0
                    and detRectMat[detNum] == 0
                    and gtNum not in gtDontCareRectsNum
                ):
                    if recallMat[gtNum, detNum] >= self.area_recall_constraint:
                        many_sum += precisionMat[gtNum, detNum]
                        gtRects.append(gtNum)
            if round(many_sum, 4) >= self.area_precision_constraint:
                return True, gtRects
            else:
                return False, []

        def center_distance(r1, r2):
            # Euclidean distance between the vertex centroids of two polygons.
            return ((np.mean(r1, axis=0) - np.mean(r2, axis=0)) ** 2).sum() ** 0.5

        def diag(r):
            # Diagonal length of the polygon's axis-aligned bounding box.
            r = np.array(r)
            return (
                (r[:, 0].max() - r[:, 0].min()) ** 2
                + (r[:, 1].max() - r[:, 1].min()) ** 2
            ) ** 0.5

        perSampleMetrics = {}

        recall = 0
        precision = 0
        hmean = 0
        recallAccum = 0.0
        precisionAccum = 0.0
        gtRects = []
        detRects = []
        gtPolPoints = []
        detPolPoints = []
        gtDontCareRectsNum = (
            []
        )  # Array of Ground Truth Rectangles' keys marked as don't Care
        detDontCareRectsNum = (
            []
        )  # Array of Detected Rectangles' matched with a don't Care GT
        pairs = []
        evaluationLog = ""

        recallMat = np.empty([1, 1])
        precisionMat = np.empty([1, 1])

        # Collect valid gt polygons; invalid or non-simple ones are skipped.
        for n in range(len(gt)):
            points = gt[n]["points"]
            # transcription = gt[n]['text']
            dontCare = gt[n]["ignore"]

            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            gtRects.append(points)
            gtPolPoints.append(points)
            if dontCare:
                gtDontCareRectsNum.append(len(gtRects) - 1)

        evaluationLog += (
            "GT rectangles: "
            + str(len(gtRects))
            + (
                " (" + str(len(gtDontCareRectsNum)) + " don't care)\n"
                if len(gtDontCareRectsNum) > 0
                else "\n"
            )
        )

        # Collect valid detections; a detection mostly covered by a
        # don't-care gt is itself marked don't-care.
        for n in range(len(pred)):
            points = pred[n]["points"]

            if not Polygon(points).is_valid or not Polygon(points).is_simple:
                continue

            detRect = points
            detRects.append(detRect)
            detPolPoints.append(points)
            if len(gtDontCareRectsNum) > 0:
                for dontCareRectNum in gtDontCareRectsNum:
                    dontCareRect = gtRects[dontCareRectNum]
                    intersected_area = get_intersection(dontCareRect, detRect)
                    rdDimensions = Polygon(detRect).area
                    if rdDimensions == 0:
                        precision = 0
                    else:
                        precision = intersected_area / rdDimensions
                    # NOTE(review): hard-coded 0.5 rather than
                    # self.area_precision_constraint (default 0.7); the
                    # ICDAR2013 variant uses the attribute here -- confirm
                    # this difference is intended for MTWI.
                    if precision > 0.5:
                        detDontCareRectsNum.append(len(detRects) - 1)
                        break

        evaluationLog += (
            "DET rectangles: "
            + str(len(detRects))
            + (
                " (" + str(len(detDontCareRectsNum)) + " don't care)\n"
                if len(detDontCareRectsNum) > 0
                else "\n"
            )
        )

        if len(gtRects) == 0:
            recall = 1
            precision = 0 if len(detRects) > 0 else 1

        if len(detRects) > 0:
            # Calculate recall and precision matrixs
            outputShape = [len(gtRects), len(detRects)]
            recallMat = np.empty(outputShape)
            precisionMat = np.empty(outputShape)
            gtRectMat = np.zeros(len(gtRects), np.int8)
            detRectMat = np.zeros(len(detRects), np.int8)
            for gtNum in range(len(gtRects)):
                for detNum in range(len(detRects)):
                    rG = gtRects[gtNum]
                    rD = detRects[detNum]
                    intersected_area = get_intersection(rG, rD)
                    rgDimensions = Polygon(rG).area
                    rdDimensions = Polygon(rD).area
                    recallMat[gtNum, detNum] = (
                        0 if rgDimensions == 0 else intersected_area / rgDimensions
                    )
                    precisionMat[gtNum, detNum] = (
                        0 if rdDimensions == 0 else intersected_area / rdDimensions
                    )

            # Find one-to-one matches
            evaluationLog += "Find one-to-one matches\n"
            for gtNum in range(len(gtRects)):
                for detNum in range(len(detRects)):
                    if (
                        gtRectMat[gtNum] == 0
                        and detRectMat[detNum] == 0
                        and gtNum not in gtDontCareRectsNum
                        and detNum not in detDontCareRectsNum
                    ):
                        match = one_to_one_match(gtNum, detNum)
                        if match is True:
                            # in deteval we have to make other validation before mark as one-to-one
                            rG = gtRects[gtNum]
                            rD = detRects[detNum]
                            normDist = center_distance(rG, rD)
                            normDist /= diag(rG) + diag(rD)
                            normDist *= 2.0
                            if normDist < self.ev_param_ind_center_diff_thr:
                                gtRectMat[gtNum] = 1
                                detRectMat[detNum] = 1
                                recallAccum += 1.0
                                precisionAccum += 1.0
                                pairs.append({"gt": gtNum, "det": detNum, "type": "OO"})
                                evaluationLog += (
                                    "Match GT #"
                                    + str(gtNum)
                                    + " with Det #"
                                    + str(detNum)
                                    + "\n"
                                )
                            else:
                                evaluationLog += (
                                    "Match Discarded GT #"
                                    + str(gtNum)
                                    + " with Det #"
                                    + str(detNum)
                                    + " normDist: "
                                    + str(normDist)
                                    + " \n"
                                )
            # Find one-to-many matches
            evaluationLog += "Find one-to-many matches\n"
            for gtNum in range(len(gtRects)):
                if gtNum not in gtDontCareRectsNum:
                    match, matchesDet = one_to_many_match(gtNum)
                    if match is True:
                        gtRectMat[gtNum] = 1
                        recallAccum += 1.0
                        # k detections share one gt: precision credit is
                        # k / (1 + log(k)).
                        precisionAccum += len(matchesDet) / (
                            1 + math.log(len(matchesDet))
                        )
                        pairs.append(
                            {
                                "gt": gtNum,
                                "det": matchesDet,
                                "type": "OO" if len(matchesDet) == 1 else "OM",
                            }
                        )
                        for detNum in matchesDet:
                            detRectMat[detNum] = 1
                        evaluationLog += (
                            "Match GT #"
                            + str(gtNum)
                            + " with Det #"
                            + str(matchesDet)
                            + "\n"
                        )

            # Find many-to-one matches
            evaluationLog += "Find many-to-one matches\n"
            for detNum in range(len(detRects)):
                if detNum not in detDontCareRectsNum:
                    match, matchesGt = many_to_one_match(detNum)
                    if match is True:
                        detRectMat[detNum] = 1
                        # k gts merged into one detection: recall credit is
                        # k / (1 + log(k)).
                        recallAccum += len(matchesGt) / (1 + math.log(len(matchesGt)))
                        precisionAccum += 1.0
                        pairs.append(
                            {
                                "gt": matchesGt,
                                "det": detNum,
                                "type": "OO" if len(matchesGt) == 1 else "MO",
                            }
                        )
                        for gtNum in matchesGt:
                            gtRectMat[gtNum] = 1
                        evaluationLog += (
                            "Match GT #"
                            + str(matchesGt)
                            + " with Det #"
                            + str(detNum)
                            + "\n"
                        )

            numGtCare = len(gtRects) - len(gtDontCareRectsNum)
            if numGtCare == 0:
                recall = float(1)
                precision = float(0) if len(detRects) > 0 else float(1)
            else:
                recall = float(recallAccum) / numGtCare
                precision = (
                    float(0)
                    if (len(detRects) - len(detDontCareRectsNum)) == 0
                    else float(precisionAccum)
                    / (len(detRects) - len(detDontCareRectsNum))
                )
            hmean = (
                0
                if (precision + recall) == 0
                else 2.0 * precision * recall / (precision + recall)
            )

        numGtCare = len(gtRects) - len(gtDontCareRectsNum)
        numDetCare = len(detRects) - len(detDontCareRectsNum)

        perSampleMetrics = {
            "precision": precision,
            "recall": recall,
            "hmean": hmean,
            "pairs": pairs,
            # Matrices are dropped for very crowded images to keep results small.
            "recallMat": [] if len(detRects) > 100 else recallMat.tolist(),
            "precisionMat": [] if len(detRects) > 100 else precisionMat.tolist(),
            "gtPolPoints": gtPolPoints,
            "detPolPoints": detPolPoints,
            "gtCare": numGtCare,
            "detCare": numDetCare,
            "gtDontCare": gtDontCareRectsNum,
            "detDontCare": detDontCareRectsNum,
            "recallAccum": recallAccum,
            "precisionAccum": precisionAccum,
            "evaluationLog": evaluationLog,
        }

        return perSampleMetrics

    def combine_results(self, results):
        """Combine per-image accumulators into dataset precision/recall/hmean."""
        numGt = 0
        numDet = 0
        methodRecallSum = 0
        methodPrecisionSum = 0

        for result in results:
            numGt += result["gtCare"]
            numDet += result["detCare"]
            methodRecallSum += result["recallAccum"]
            methodPrecisionSum += result["precisionAccum"]

        methodRecall = 0 if numGt == 0 else methodRecallSum / numGt
        methodPrecision = 0 if numDet == 0 else methodPrecisionSum / numDet
        methodHmean = (
            0
            if methodRecall + methodPrecision == 0
            else 2 * methodRecall * methodPrecision / (methodRecall + methodPrecision)
        )

        methodMetrics = {
            "precision": methodPrecision,
            "recall": methodRecall,
            "hmean": methodHmean,
        }

        return methodMetrics
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test for the evaluator defined in this module.
    # BUGFIX: this previously instantiated DetectionICDAR2013Evaluator,
    # which is neither defined nor imported in this file and raised
    # NameError; use DetectionMTWI2018Evaluator instead.
    evaluator = DetectionMTWI2018Evaluator()
    gts = [
        [
            {
                "points": [(0, 0), (1, 0), (1, 1), (0, 1)],
                "text": 1234,
                "ignore": False,
            },
            {
                "points": [(2, 2), (3, 2), (3, 3), (2, 3)],
                "text": 5678,
                "ignore": True,
            },
        ]
    ]
    preds = [
        [
            {
                "points": [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
                "text": 123,
                "ignore": False,
            }
        ]
    ]
    results = []
    for gt, pred in zip(gts, preds):
        results.append(evaluator.evaluate_image(gt, pred))
    metrics = evaluator.combine_results(results)
    print(metrics)
|
||||
@ -0,0 +1,100 @@
|
||||
import numpy as np
|
||||
|
||||
from .detection.iou import DetectionIoUEvaluator
|
||||
|
||||
|
||||
class AverageMeter(object):
    """Tracks the most recent value and a running (weighted) average."""

    def __init__(self):
        # Start from a clean slate.
        self.reset()

    def reset(self):
        # Drop all recorded statistics.
        self.count = 0
        self.sum = 0
        self.avg = 0
        self.val = 0

    def update(self, val, n=1):
        # Record `val` weighted by `n` observations and refresh the mean.
        # Returns self so calls can be chained.
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count
        return self
|
||||
|
||||
|
||||
class QuadMetric:
|
||||
def __init__(self, is_output_polygon=False):
    # is_output_polygon: forwarded to DetectionIoUEvaluator; selects exact
    # polygon IoU matching (True) vs rotated-rectangle IoU (False).
    self.is_output_polygon = is_output_polygon
    self.evaluator = DetectionIoUEvaluator(is_output_polygon=is_output_polygon)
|
||||
|
||||
def measure(self, batch, output, box_thresh=0.6):
|
||||
"""
|
||||
batch: (image, polygons, ignore_tags
|
||||
batch: a dict produced by dataloaders.
|
||||
image: tensor of shape (N, C, H, W).
|
||||
polygons: tensor of shape (N, K, 4, 2), the polygons of objective regions.
|
||||
ignore_tags: tensor of shape (N, K), indicates whether a region is ignorable or not.
|
||||
shape: the original shape of images.
|
||||
filename: the original filenames of images.
|
||||
output: (polygons, ...)
|
||||
"""
|
||||
results = []
|
||||
gt_polyons_batch = batch["text_polys"]
|
||||
ignore_tags_batch = batch["ignore_tags"]
|
||||
pred_polygons_batch = np.array(output[0])
|
||||
pred_scores_batch = np.array(output[1])
|
||||
for polygons, pred_polygons, pred_scores, ignore_tags in zip(
|
||||
gt_polyons_batch, pred_polygons_batch, pred_scores_batch, ignore_tags_batch
|
||||
):
|
||||
gt = [
|
||||
dict(points=np.int64(polygons[i]), ignore=ignore_tags[i])
|
||||
for i in range(len(polygons))
|
||||
]
|
||||
if self.is_output_polygon:
|
||||
pred = [
|
||||
dict(points=pred_polygons[i]) for i in range(len(pred_polygons))
|
||||
]
|
||||
else:
|
||||
pred = []
|
||||
# print(pred_polygons.shape)
|
||||
for i in range(pred_polygons.shape[0]):
|
||||
if pred_scores[i] >= box_thresh:
|
||||
# print(pred_polygons[i,:,:].tolist())
|
||||
pred.append(
|
||||
dict(points=pred_polygons[i, :, :].astype(np.int32))
|
||||
)
|
||||
# pred = [dict(points=pred_polygons[i,:,:].tolist()) if pred_scores[i] >= box_thresh for i in range(pred_polygons.shape[0])]
|
||||
results.append(self.evaluator.evaluate_image(gt, pred))
|
||||
return results
|
||||
|
||||
def validate_measure(self, batch, output, box_thresh=0.6):
|
||||
return self.measure(batch, output, box_thresh)
|
||||
|
||||
def evaluate_measure(self, batch, output):
|
||||
return (
|
||||
self.measure(batch, output),
|
||||
np.linspace(0, batch["image"].shape[0]).tolist(),
|
||||
)
|
||||
|
||||
def gather_measure(self, raw_metrics):
|
||||
raw_metrics = [
|
||||
image_metrics
|
||||
for batch_metrics in raw_metrics
|
||||
for image_metrics in batch_metrics
|
||||
]
|
||||
|
||||
result = self.evaluator.combine_results(raw_metrics)
|
||||
|
||||
precision = AverageMeter()
|
||||
recall = AverageMeter()
|
||||
fmeasure = AverageMeter()
|
||||
|
||||
precision.update(result["precision"], n=len(raw_metrics))
|
||||
recall.update(result["recall"], n=len(raw_metrics))
|
||||
fmeasure_score = (
|
||||
2 * precision.val * recall.val / (precision.val + recall.val + 1e-8)
|
||||
)
|
||||
fmeasure.update(fmeasure_score)
|
||||
|
||||
return {"precision": precision, "recall": recall, "fmeasure": fmeasure}
|
||||
112
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/profiler.py
Normal file
112
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/profiler.py
Normal file
@ -0,0 +1,112 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import paddle
|
||||
|
||||
# A global variable to record the number of calling times for profiler
|
||||
# functions. It is used to specify the tracing range of training steps.
|
||||
_profiler_step_id = 0
|
||||
|
||||
# A global variable to avoid parsing from string every time.
|
||||
_profiler_options = None
|
||||
|
||||
|
||||
class ProfilerOptions(object):
|
||||
"""
|
||||
Use a string to initialize a ProfilerOptions.
|
||||
The string should be in the format: "key1=value1;key2=value;key3=value3".
|
||||
For example:
|
||||
"profile_path=model.profile"
|
||||
"batch_range=[50, 60]; profile_path=model.profile"
|
||||
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
|
||||
ProfilerOptions supports following key-value pair:
|
||||
batch_range - a integer list, e.g. [100, 110].
|
||||
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
|
||||
sorted_key - a string, the optional values are 'calls', 'total',
|
||||
'max', 'min' or 'ave.
|
||||
tracer_option - a string, the optional values are 'Default', 'OpDetail',
|
||||
'AllOpDetail'.
|
||||
profile_path - a string, the path to save the serialized profile data,
|
||||
which can be used to generate a timeline.
|
||||
exit_on_finished - a boolean.
|
||||
"""
|
||||
|
||||
def __init__(self, options_str):
|
||||
assert isinstance(options_str, str)
|
||||
|
||||
self._options = {
|
||||
"batch_range": [10, 20],
|
||||
"state": "All",
|
||||
"sorted_key": "total",
|
||||
"tracer_option": "Default",
|
||||
"profile_path": "/tmp/profile",
|
||||
"exit_on_finished": True,
|
||||
}
|
||||
self._parse_from_string(options_str)
|
||||
|
||||
def _parse_from_string(self, options_str):
|
||||
for kv in options_str.replace(" ", "").split(";"):
|
||||
key, value = kv.split("=")
|
||||
if key == "batch_range":
|
||||
value_list = value.replace("[", "").replace("]", "").split(",")
|
||||
value_list = list(map(int, value_list))
|
||||
if (
|
||||
len(value_list) >= 2
|
||||
and value_list[0] >= 0
|
||||
and value_list[1] > value_list[0]
|
||||
):
|
||||
self._options[key] = value_list
|
||||
elif key == "exit_on_finished":
|
||||
self._options[key] = value.lower() in ("yes", "true", "t", "1")
|
||||
elif key in ["state", "sorted_key", "tracer_option", "profile_path"]:
|
||||
self._options[key] = value
|
||||
|
||||
def __getitem__(self, name):
|
||||
if self._options.get(name, None) is None:
|
||||
raise ValueError("ProfilerOptions does not have an option named %s." % name)
|
||||
return self._options[name]
|
||||
|
||||
|
||||
def add_profiler_step(options_str=None):
|
||||
"""
|
||||
Enable the operator-level timing using PaddlePaddle's profiler.
|
||||
The profiler uses a independent variable to count the profiler steps.
|
||||
One call of this function is treated as a profiler step.
|
||||
|
||||
Args:
|
||||
profiler_options - a string to initialize the ProfilerOptions.
|
||||
Default is None, and the profiler is disabled.
|
||||
"""
|
||||
if options_str is None:
|
||||
return
|
||||
|
||||
global _profiler_step_id
|
||||
global _profiler_options
|
||||
|
||||
if _profiler_options is None:
|
||||
_profiler_options = ProfilerOptions(options_str)
|
||||
|
||||
if _profiler_step_id == _profiler_options["batch_range"][0]:
|
||||
paddle.utils.profiler.start_profiler(
|
||||
_profiler_options["state"], _profiler_options["tracer_option"]
|
||||
)
|
||||
elif _profiler_step_id == _profiler_options["batch_range"][1]:
|
||||
paddle.utils.profiler.stop_profiler(
|
||||
_profiler_options["sorted_key"], _profiler_options["profile_path"]
|
||||
)
|
||||
if _profiler_options["exit_on_finished"]:
|
||||
sys.exit(0)
|
||||
|
||||
_profiler_step_id += 1
|
||||
@ -0,0 +1,72 @@
|
||||
from paddle.optimizer import lr
|
||||
import logging
|
||||
|
||||
__all__ = ["Polynomial"]
|
||||
|
||||
|
||||
class Polynomial(object):
|
||||
"""
|
||||
Polynomial learning rate decay
|
||||
Args:
|
||||
learning_rate (float): The initial learning rate. It is a python float number.
|
||||
epochs(int): The decay epoch size. It determines the decay cycle, when by_epoch is set to true, it will change to epochs=epochs*step_each_epoch.
|
||||
step_each_epoch: all steps in each epoch.
|
||||
end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
|
||||
power(float, optional): Power of polynomial. Default: 1.0.
|
||||
warmup_epoch(int): The epoch numbers for LinearWarmup. Default: 0, , when by_epoch is set to true, it will change to warmup_epoch=warmup_epoch*step_each_epoch.
|
||||
warmup_start_lr(float): Initial learning rate of warm up. Default: 0.0.
|
||||
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
|
||||
by_epoch: Whether the set parameter is based on epoch or iter, when set to true,, epochs and warmup_epoch will be automatically multiplied by step_each_epoch. Default: True
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
learning_rate,
|
||||
epochs,
|
||||
step_each_epoch,
|
||||
end_lr=0.0,
|
||||
power=1.0,
|
||||
warmup_epoch=0,
|
||||
warmup_start_lr=0.0,
|
||||
last_epoch=-1,
|
||||
by_epoch=True,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
if warmup_epoch >= epochs:
|
||||
msg = f'When using warm up, the value of "epochs" must be greater than value of "Optimizer.lr.warmup_epoch". The value of "Optimizer.lr.warmup_epoch" has been set to {epochs}.'
|
||||
logging.warning(msg)
|
||||
warmup_epoch = epochs
|
||||
self.learning_rate = learning_rate
|
||||
self.epochs = epochs
|
||||
self.end_lr = end_lr
|
||||
self.power = power
|
||||
self.last_epoch = last_epoch
|
||||
self.warmup_epoch = warmup_epoch
|
||||
self.warmup_start_lr = warmup_start_lr
|
||||
|
||||
if by_epoch:
|
||||
self.epochs *= step_each_epoch
|
||||
self.warmup_epoch = int(self.warmup_epoch * step_each_epoch)
|
||||
|
||||
def __call__(self):
|
||||
learning_rate = (
|
||||
lr.PolynomialDecay(
|
||||
learning_rate=self.learning_rate,
|
||||
decay_steps=self.epochs,
|
||||
end_lr=self.end_lr,
|
||||
power=self.power,
|
||||
last_epoch=self.last_epoch,
|
||||
)
|
||||
if self.epochs > 0
|
||||
else self.learning_rate
|
||||
)
|
||||
if self.warmup_epoch > 0:
|
||||
learning_rate = lr.LinearWarmup(
|
||||
learning_rate=learning_rate,
|
||||
warmup_steps=self.warmup_epoch,
|
||||
start_lr=self.warmup_start_lr,
|
||||
end_lr=self.learning_rate,
|
||||
last_epoch=self.last_epoch,
|
||||
)
|
||||
return learning_rate
|
||||
365
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/util.py
Normal file
365
PaddleOCR-3.1.0/benchmark/PaddleOCR_DBNet/utils/util.py
Normal file
@ -0,0 +1,365 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# @Time : 2019/8/23 21:59
|
||||
# @Author : zhoujun
|
||||
import json
|
||||
import pathlib
|
||||
import time
|
||||
import os
|
||||
import glob
|
||||
import cv2
|
||||
import yaml
|
||||
from typing import Mapping
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
|
||||
|
||||
def _check_image_file(path):
|
||||
img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff", "gif", "pdf"}
|
||||
return any([path.lower().endswith(e) for e in img_end])
|
||||
|
||||
|
||||
def get_image_file_list(img_file):
|
||||
imgs_lists = []
|
||||
if img_file is None or not os.path.exists(img_file):
|
||||
raise Exception("not found any img file in {}".format(img_file))
|
||||
|
||||
img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff", "gif", "pdf"}
|
||||
if os.path.isfile(img_file) and _check_image_file(img_file):
|
||||
imgs_lists.append(img_file)
|
||||
elif os.path.isdir(img_file):
|
||||
for single_file in os.listdir(img_file):
|
||||
file_path = os.path.join(img_file, single_file)
|
||||
if os.path.isfile(file_path) and _check_image_file(file_path):
|
||||
imgs_lists.append(file_path)
|
||||
if len(imgs_lists) == 0:
|
||||
raise Exception("not found any img file in {}".format(img_file))
|
||||
imgs_lists = sorted(imgs_lists)
|
||||
return imgs_lists
|
||||
|
||||
|
||||
def setup_logger(log_file_path: str = None):
|
||||
import logging
|
||||
|
||||
logging._warn_preinit_stderr = 0
|
||||
logger = logging.getLogger("DBNet.paddle")
|
||||
formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
|
||||
ch = logging.StreamHandler()
|
||||
ch.setFormatter(formatter)
|
||||
logger.addHandler(ch)
|
||||
if log_file_path is not None:
|
||||
file_handle = logging.FileHandler(log_file_path)
|
||||
file_handle.setFormatter(formatter)
|
||||
logger.addHandler(file_handle)
|
||||
logger.setLevel(logging.DEBUG)
|
||||
return logger
|
||||
|
||||
|
||||
# --exeTime
|
||||
def exe_time(func):
|
||||
def newFunc(*args, **args2):
|
||||
t0 = time.time()
|
||||
back = func(*args, **args2)
|
||||
print("{} cost {:.3f}s".format(func.__name__, time.time() - t0))
|
||||
return back
|
||||
|
||||
return newFunc
|
||||
|
||||
|
||||
def load(file_path: str):
|
||||
file_path = pathlib.Path(file_path)
|
||||
func_dict = {".txt": _load_txt, ".json": _load_json, ".list": _load_txt}
|
||||
assert file_path.suffix in func_dict
|
||||
return func_dict[file_path.suffix](file_path)
|
||||
|
||||
|
||||
def _load_txt(file_path: str):
|
||||
with open(file_path, "r", encoding="utf8") as f:
|
||||
content = [
|
||||
x.strip().strip("\ufeff").strip("\xef\xbb\xbf") for x in f.readlines()
|
||||
]
|
||||
return content
|
||||
|
||||
|
||||
def _load_json(file_path: str):
|
||||
with open(file_path, "r", encoding="utf8") as f:
|
||||
content = json.load(f)
|
||||
return content
|
||||
|
||||
|
||||
def save(data, file_path):
|
||||
file_path = pathlib.Path(file_path)
|
||||
func_dict = {".txt": _save_txt, ".json": _save_json}
|
||||
assert file_path.suffix in func_dict
|
||||
return func_dict[file_path.suffix](data, file_path)
|
||||
|
||||
|
||||
def _save_txt(data, file_path):
|
||||
"""
|
||||
将一个list的数组写入txt文件里
|
||||
:param data:
|
||||
:param file_path:
|
||||
:return:
|
||||
"""
|
||||
if not isinstance(data, list):
|
||||
data = [data]
|
||||
with open(file_path, mode="w", encoding="utf8") as f:
|
||||
f.write("\n".join(data))
|
||||
|
||||
|
||||
def _save_json(data, file_path):
|
||||
with open(file_path, "w", encoding="utf-8") as json_file:
|
||||
json.dump(data, json_file, ensure_ascii=False, indent=4)
|
||||
|
||||
|
||||
def show_img(imgs: np.ndarray, title="img"):
|
||||
color = len(imgs.shape) == 3 and imgs.shape[-1] == 3
|
||||
imgs = np.expand_dims(imgs, axis=0)
|
||||
for i, img in enumerate(imgs):
|
||||
plt.figure()
|
||||
plt.title("{}_{}".format(title, i))
|
||||
plt.imshow(img, cmap=None if color else "gray")
|
||||
plt.show()
|
||||
|
||||
|
||||
def draw_bbox(img_path, result, color=(255, 0, 0), thickness=2):
|
||||
if isinstance(img_path, str):
|
||||
img_path = cv2.imread(img_path)
|
||||
# img_path = cv2.cvtColor(img_path, cv2.COLOR_BGR2RGB)
|
||||
img_path = img_path.copy()
|
||||
for point in result:
|
||||
point = point.astype(int)
|
||||
cv2.polylines(img_path, [point], True, color, thickness)
|
||||
return img_path
|
||||
|
||||
|
||||
def cal_text_score(texts, gt_texts, training_masks, running_metric_text, thred=0.5):
|
||||
training_masks = training_masks.numpy()
|
||||
pred_text = texts.numpy() * training_masks
|
||||
pred_text[pred_text <= thred] = 0
|
||||
pred_text[pred_text > thred] = 1
|
||||
pred_text = pred_text.astype(np.int32)
|
||||
gt_text = gt_texts.numpy() * training_masks
|
||||
gt_text = gt_text.astype(np.int32)
|
||||
running_metric_text.update(gt_text, pred_text)
|
||||
score_text, _ = running_metric_text.get_scores()
|
||||
return score_text
|
||||
|
||||
|
||||
def order_points_clockwise(pts):
|
||||
rect = np.zeros((4, 2), dtype="float32")
|
||||
s = pts.sum(axis=1)
|
||||
rect[0] = pts[np.argmin(s)]
|
||||
rect[2] = pts[np.argmax(s)]
|
||||
diff = np.diff(pts, axis=1)
|
||||
rect[1] = pts[np.argmin(diff)]
|
||||
rect[3] = pts[np.argmax(diff)]
|
||||
return rect
|
||||
|
||||
|
||||
def order_points_clockwise_list(pts):
|
||||
pts = pts.tolist()
|
||||
pts.sort(key=lambda x: (x[1], x[0]))
|
||||
pts[:2] = sorted(pts[:2], key=lambda x: x[0])
|
||||
pts[2:] = sorted(pts[2:], key=lambda x: -x[0])
|
||||
pts = np.array(pts)
|
||||
return pts
|
||||
|
||||
|
||||
def get_datalist(train_data_path):
|
||||
"""
|
||||
获取训练和验证的数据list
|
||||
:param train_data_path: 训练的dataset文件列表,每个文件内以如下格式存储 ‘path/to/img\tlabel’
|
||||
:return:
|
||||
"""
|
||||
train_data = []
|
||||
for p in train_data_path:
|
||||
with open(p, "r", encoding="utf-8") as f:
|
||||
for line in f.readlines():
|
||||
line = line.strip("\n").replace(".jpg ", ".jpg\t").split("\t")
|
||||
if len(line) > 1:
|
||||
img_path = pathlib.Path(line[0].strip(" "))
|
||||
label_path = pathlib.Path(line[1].strip(" "))
|
||||
if (
|
||||
img_path.exists()
|
||||
and img_path.stat().st_size > 0
|
||||
and label_path.exists()
|
||||
and label_path.stat().st_size > 0
|
||||
):
|
||||
train_data.append((str(img_path), str(label_path)))
|
||||
return train_data
|
||||
|
||||
|
||||
def save_result(result_path, box_list, score_list, is_output_polygon):
|
||||
if is_output_polygon:
|
||||
with open(result_path, "wt") as res:
|
||||
for i, box in enumerate(box_list):
|
||||
box = box.reshape(-1).tolist()
|
||||
result = ",".join([str(int(x)) for x in box])
|
||||
score = score_list[i]
|
||||
res.write(result + "," + str(score) + "\n")
|
||||
else:
|
||||
with open(result_path, "wt") as res:
|
||||
for i, box in enumerate(box_list):
|
||||
score = score_list[i]
|
||||
box = box.reshape(-1).tolist()
|
||||
result = ",".join([str(int(x)) for x in box])
|
||||
res.write(result + "," + str(score) + "\n")
|
||||
|
||||
|
||||
def expand_polygon(polygon):
|
||||
"""
|
||||
对只有一个字符的框进行扩充
|
||||
"""
|
||||
(x, y), (w, h), angle = cv2.minAreaRect(np.float32(polygon))
|
||||
if angle < -45:
|
||||
w, h = h, w
|
||||
angle += 90
|
||||
new_w = w + h
|
||||
box = ((x, y), (new_w, h), angle)
|
||||
points = cv2.boxPoints(box)
|
||||
return order_points_clockwise(points)
|
||||
|
||||
|
||||
def _merge_dict(config, merge_dct):
|
||||
"""Recursive dict merge. Inspired by :meth:``dict.update()``, instead of
|
||||
updating only top-level keys, dict_merge recurses down into dicts nested
|
||||
to an arbitrary depth, updating keys. The ``merge_dct`` is merged into
|
||||
``dct``.
|
||||
Args:
|
||||
config: dict onto which the merge is executed
|
||||
merge_dct: dct merged into config
|
||||
Returns: dct
|
||||
"""
|
||||
for key, value in merge_dct.items():
|
||||
sub_keys = key.split(".")
|
||||
key = sub_keys[0]
|
||||
if key in config and len(sub_keys) > 1:
|
||||
_merge_dict(config[key], {".".join(sub_keys[1:]): value})
|
||||
elif (
|
||||
key in config
|
||||
and isinstance(config[key], dict)
|
||||
and isinstance(value, Mapping)
|
||||
):
|
||||
_merge_dict(config[key], value)
|
||||
else:
|
||||
config[key] = value
|
||||
return config
|
||||
|
||||
|
||||
def print_dict(cfg, print_func=print, delimiter=0):
|
||||
"""
|
||||
Recursively visualize a dict and
|
||||
indenting acrrording by the relationship of keys.
|
||||
"""
|
||||
for k, v in sorted(cfg.items()):
|
||||
if isinstance(v, dict):
|
||||
print_func("{}{} : ".format(delimiter * " ", str(k)))
|
||||
print_dict(v, print_func, delimiter + 4)
|
||||
elif isinstance(v, list) and len(v) >= 1 and isinstance(v[0], dict):
|
||||
print_func("{}{} : ".format(delimiter * " ", str(k)))
|
||||
for value in v:
|
||||
print_dict(value, print_func, delimiter + 4)
|
||||
else:
|
||||
print_func("{}{} : {}".format(delimiter * " ", k, v))
|
||||
|
||||
|
||||
class Config(object):
|
||||
def __init__(self, config_path, BASE_KEY="base"):
|
||||
self.BASE_KEY = BASE_KEY
|
||||
self.cfg = self._load_config_with_base(config_path)
|
||||
|
||||
def _load_config_with_base(self, file_path):
|
||||
"""
|
||||
Load config from file.
|
||||
Args:
|
||||
file_path (str): Path of the config file to be loaded.
|
||||
Returns: global config
|
||||
"""
|
||||
_, ext = os.path.splitext(file_path)
|
||||
assert ext in [".yml", ".yaml"], "only support yaml files for now"
|
||||
|
||||
with open(file_path) as f:
|
||||
file_cfg = yaml.load(f, Loader=yaml.Loader)
|
||||
|
||||
# NOTE: cfgs outside have higher priority than cfgs in _BASE_
|
||||
if self.BASE_KEY in file_cfg:
|
||||
all_base_cfg = dict()
|
||||
base_ymls = list(file_cfg[self.BASE_KEY])
|
||||
for base_yml in base_ymls:
|
||||
with open(base_yml) as f:
|
||||
base_cfg = self._load_config_with_base(base_yml)
|
||||
all_base_cfg = _merge_dict(all_base_cfg, base_cfg)
|
||||
|
||||
del file_cfg[self.BASE_KEY]
|
||||
file_cfg = _merge_dict(all_base_cfg, file_cfg)
|
||||
file_cfg["filename"] = os.path.splitext(os.path.split(file_path)[-1])[0]
|
||||
return file_cfg
|
||||
|
||||
def merge_dict(self, args):
|
||||
self.cfg = _merge_dict(self.cfg, args)
|
||||
|
||||
def print_cfg(self, print_func=print):
|
||||
"""
|
||||
Recursively visualize a dict and
|
||||
indenting according by the relationship of keys.
|
||||
"""
|
||||
print_func("----------- Config -----------")
|
||||
print_dict(self.cfg, print_func)
|
||||
print_func("---------------------------------------------")
|
||||
|
||||
def save(self, p):
|
||||
with open(p, "w") as f:
|
||||
yaml.dump(dict(self.cfg), f, default_flow_style=False, sort_keys=False)
|
||||
|
||||
|
||||
class ArgsParser(ArgumentParser):
|
||||
def __init__(self):
|
||||
super(ArgsParser, self).__init__(formatter_class=RawDescriptionHelpFormatter)
|
||||
self.add_argument("-c", "--config_file", help="configuration file to use")
|
||||
self.add_argument("-o", "--opt", nargs="*", help="set configuration options")
|
||||
self.add_argument(
|
||||
"-p",
|
||||
"--profiler_options",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The option of profiler, which should be in format "
|
||||
'"key1=value1;key2=value2;key3=value3".',
|
||||
)
|
||||
|
||||
def parse_args(self, argv=None):
|
||||
args = super(ArgsParser, self).parse_args(argv)
|
||||
assert (
|
||||
args.config_file is not None
|
||||
), "Please specify --config_file=configure_file_path."
|
||||
args.opt = self._parse_opt(args.opt)
|
||||
return args
|
||||
|
||||
def _parse_opt(self, opts):
|
||||
config = {}
|
||||
if not opts:
|
||||
return config
|
||||
for s in opts:
|
||||
s = s.strip()
|
||||
k, v = s.split("=", 1)
|
||||
if "." not in k:
|
||||
config[k] = yaml.load(v, Loader=yaml.Loader)
|
||||
else:
|
||||
keys = k.split(".")
|
||||
if keys[0] not in config:
|
||||
config[keys[0]] = {}
|
||||
cur = config[keys[0]]
|
||||
for idx, key in enumerate(keys[1:]):
|
||||
if idx == len(keys) - 2:
|
||||
cur[key] = yaml.load(v, Loader=yaml.Loader)
|
||||
else:
|
||||
cur[key] = {}
|
||||
cur = cur[key]
|
||||
return config
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
img = np.zeros((1, 3, 640, 640))
|
||||
show_img(img[0][0])
|
||||
plt.show()
|
||||
354
PaddleOCR-3.1.0/benchmark/analysis.py
Normal file
354
PaddleOCR-3.1.0/benchmark/analysis.py
Normal file
@ -0,0 +1,354 @@
|
||||
# copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--filename", type=str, help="The name of log which need to analysis."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log_with_profiler", type=str, help="The path of train log with profiler"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profiler_path", type=str, help="The path of profiler timeline log."
|
||||
)
|
||||
parser.add_argument("--keyword", type=str, help="Keyword to specify analysis data")
|
||||
parser.add_argument(
|
||||
"--separator",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Separator of different field in log",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--position", type=int, default=None, help="The position of data field"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--range", type=str, default="", help="The range of data field to intercept"
|
||||
)
|
||||
parser.add_argument("--base_batch_size", type=int, help="base_batch size on gpu")
|
||||
parser.add_argument(
|
||||
"--skip_steps", type=int, default=0, help="The number of steps to be skipped"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_mode", type=int, default=-1, help="Analysis mode, default value is -1"
|
||||
)
|
||||
parser.add_argument("--ips_unit", type=str, default=None, help="IPS unit")
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
default=0,
|
||||
help="training model_name, transformer_base",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--mission_name", type=str, default=0, help="training mission name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--direction_id", type=int, default=0, help="training direction_id"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--run_mode", type=str, default="sp", help="multi process or single process"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--index",
|
||||
type=int,
|
||||
default=1,
|
||||
help="{1: speed, 2:mem, 3:profiler, 6:max_batch_size}",
|
||||
)
|
||||
parser.add_argument("--gpu_num", type=int, default=1, help="nums of training gpus")
|
||||
args = parser.parse_args()
|
||||
args.separator = None if args.separator == "None" else args.separator
|
||||
return args
|
||||
|
||||
|
||||
def _is_number(num):
|
||||
pattern = re.compile(r"^[-+]?[-0-9]\d*\.\d*|[-+]?\.?[0-9]\d*$")
|
||||
result = pattern.match(num)
|
||||
if result:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
class TimeAnalyzer(object):
|
||||
def __init__(
|
||||
self, filename, keyword=None, separator=None, position=None, range="-1"
|
||||
):
|
||||
if filename is None:
|
||||
raise Exception("Please specify the filename!")
|
||||
|
||||
if keyword is None:
|
||||
raise Exception("Please specify the keyword!")
|
||||
|
||||
self.filename = filename
|
||||
self.keyword = keyword
|
||||
self.separator = separator
|
||||
self.position = position
|
||||
self.range = range
|
||||
self.records = None
|
||||
self._distil()
|
||||
|
||||
def _distil(self):
|
||||
self.records = []
|
||||
with open(self.filename, "r") as f_object:
|
||||
lines = f_object.readlines()
|
||||
for line in lines:
|
||||
if self.keyword not in line:
|
||||
continue
|
||||
try:
|
||||
result = None
|
||||
|
||||
# Distil the string from a line.
|
||||
line = line.strip()
|
||||
line_words = (
|
||||
line.split(self.separator) if self.separator else line.split()
|
||||
)
|
||||
if args.position:
|
||||
result = line_words[self.position]
|
||||
else:
|
||||
# Distil the string following the keyword.
|
||||
for i in range(len(line_words) - 1):
|
||||
if line_words[i] == self.keyword:
|
||||
result = line_words[i + 1]
|
||||
break
|
||||
|
||||
# Distil the result from the picked string.
|
||||
if not self.range:
|
||||
result = result[0:]
|
||||
elif _is_number(self.range):
|
||||
result = result[0 : int(self.range)]
|
||||
else:
|
||||
result = result[
|
||||
int(self.range.split(":")[0]) : int(
|
||||
self.range.split(":")[1]
|
||||
)
|
||||
]
|
||||
self.records.append(float(result))
|
||||
except Exception as exc:
|
||||
print(
|
||||
"line is: {}; separator={}; position={}".format(
|
||||
line, self.separator, self.position
|
||||
)
|
||||
)
|
||||
|
||||
print(
|
||||
"Extract {} records: separator={}; position={}".format(
|
||||
len(self.records), self.separator, self.position
|
||||
)
|
||||
)
|
||||
|
||||
def _get_fps(self, mode, batch_size, gpu_num, avg_of_records, run_mode, unit=None):
|
||||
if mode == -1 and run_mode == "sp":
|
||||
assert unit, "Please set the unit when mode is -1."
|
||||
fps = gpu_num * avg_of_records
|
||||
elif mode == -1 and run_mode == "mp":
|
||||
assert unit, "Please set the unit when mode is -1."
|
||||
fps = gpu_num * avg_of_records # temporarily, not used now
|
||||
print("------------this is mp")
|
||||
elif mode == 0:
|
||||
# s/step -> samples/s
|
||||
fps = (batch_size * gpu_num) / avg_of_records
|
||||
unit = "samples/s"
|
||||
elif mode == 1:
|
||||
# steps/s -> steps/s
|
||||
fps = avg_of_records
|
||||
unit = "steps/s"
|
||||
elif mode == 2:
|
||||
# s/step -> steps/s
|
||||
fps = 1 / avg_of_records
|
||||
unit = "steps/s"
|
||||
elif mode == 3:
|
||||
# steps/s -> samples/s
|
||||
fps = batch_size * gpu_num * avg_of_records
|
||||
unit = "samples/s"
|
||||
elif mode == 4:
|
||||
# s/epoch -> s/epoch
|
||||
fps = avg_of_records
|
||||
unit = "s/epoch"
|
||||
else:
|
||||
ValueError("Unsupported analysis mode.")
|
||||
|
||||
return fps, unit
|
||||
|
||||
def analysis(
    self, batch_size, gpu_num=1, skip_steps=0, mode=-1, run_mode="sp", unit=None
):
    """Aggregate ``self.records``, print a summary, and return the throughput.

    Args:
        batch_size (int): per-device batch size; must be > 0.
        gpu_num (int): number of devices.
        skip_steps (int): leading records to exclude from the "skipped" stats.
        mode (int): conversion mode passed to ``_get_fps``.
        run_mode (str): "sp" or "mp", forwarded to ``_get_fps``.
        unit (str, optional): unit label, forwarded to ``_get_fps``.

    Returns:
        tuple: ``(round(fps_skipped, 3), fps_unit)``, or ``(0, "")`` when the
        input is unusable (non-positive batch size or too few records).
    """
    if batch_size <= 0:
        print("base_batch_size should larger than 0.")
        return 0, ""

    # Also covers the case where the log yields exactly skip_steps records.
    if len(self.records) <= skip_steps:
        print("no records")
        return 0, ""

    count = len(self.records)
    kept = self.records[skip_steps:]
    avg_of_records = sum(self.records) / float(count)
    avg_of_records_skipped = sum(kept) / float(count - skip_steps)
    skip_min = min(kept)
    skip_max = max(kept)

    fps, fps_unit = self._get_fps(
        mode, batch_size, gpu_num, avg_of_records, run_mode, unit
    )
    fps_skipped, _ = self._get_fps(
        mode, batch_size, gpu_num, avg_of_records_skipped, run_mode, unit
    )

    # Report format depends on which quantity the raw records represent.
    if mode == -1:
        print("average ips of %d steps, skip 0 step:" % count)
        print("\tAvg: %.3f %s" % (avg_of_records, fps_unit))
        print("\tFPS: %.3f %s" % (fps, fps_unit))
        if skip_steps > 0:
            print("average ips of %d steps, skip %d steps:" % (count, skip_steps))
            print("\tAvg: %.3f %s" % (avg_of_records_skipped, fps_unit))
            print("\tMin: %.3f %s" % (skip_min, fps_unit))
            print("\tMax: %.3f %s" % (skip_max, fps_unit))
            print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
    elif mode == 1 or mode == 3:
        print("average latency of %d steps, skip 0 step:" % count)
        print("\tAvg: %.3f steps/s" % avg_of_records)
        print("\tFPS: %.3f %s" % (fps, fps_unit))
        if skip_steps > 0:
            print(
                "average latency of %d steps, skip %d steps:" % (count, skip_steps)
            )
            print("\tAvg: %.3f steps/s" % avg_of_records_skipped)
            print("\tMin: %.3f steps/s" % skip_min)
            print("\tMax: %.3f steps/s" % skip_max)
            print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))
    elif mode == 0 or mode == 2:
        print("average latency of %d steps, skip 0 step:" % count)
        print("\tAvg: %.3f s/step" % avg_of_records)
        print("\tFPS: %.3f %s" % (fps, fps_unit))
        if skip_steps > 0:
            print(
                "average latency of %d steps, skip %d steps:" % (count, skip_steps)
            )
            print("\tAvg: %.3f s/step" % avg_of_records_skipped)
            print("\tMin: %.3f s/step" % skip_min)
            print("\tMax: %.3f s/step" % skip_max)
            print("\tFPS: %.3f %s" % (fps_skipped, fps_unit))

    return round(fps_skipped, 3), fps_unit
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parse_args()
    # Base payload; it is dumped to stdout as JSON for the database import step.
    run_info = {
        "log_file": args.filename,
        "model_name": args.model_name,
        "mission_name": args.mission_name,
        "direction_id": args.direction_id,
        "run_mode": args.run_mode,
        "index": args.index,
        "gpu_num": args.gpu_num,
        "FINAL_RESULT": 0,
        "JOB_FAIL_FLAG": 0,
    }

    try:
        if args.index == 1:
            if args.gpu_num == 1:
                run_info["log_with_profiler"] = args.log_with_profiler
                run_info["profiler_path"] = args.profiler_path
            analyzer = TimeAnalyzer(
                args.filename, args.keyword, args.separator, args.position, args.range
            )
            run_info["FINAL_RESULT"], run_info["UNIT"] = analyzer.analysis(
                batch_size=args.base_batch_size,
                gpu_num=args.gpu_num,
                skip_steps=args.skip_steps,
                mode=args.model_mode,
                run_mode=args.run_mode,
                unit=args.ips_unit,
            )
            # Best effort: job_fail_flag may be unset (getenv -> None) and
            # FINAL_RESULT may not be int-convertible; ignore those cases.
            try:
                if (
                    int(os.getenv("job_fail_flag")) == 1
                    or int(run_info["FINAL_RESULT"]) == 0
                ):
                    run_info["JOB_FAIL_FLAG"] = 1
            except:
                pass
        elif args.index == 3:
            run_info["FINAL_RESULT"] = {}

            def _head(records):
                # First parsed record, or 0 when the keyword was not found.
                return records[0] if records else 0

            fo_total = TimeAnalyzer(
                args.filename, "Framework overhead", None, 3, ""
            ).records
            fo_ratio = TimeAnalyzer(
                args.filename, "Framework overhead", None, 5
            ).records
            ct_total = TimeAnalyzer(
                args.filename, "Computation time", None, 3, ""
            ).records
            gm_total = TimeAnalyzer(
                args.filename, "GpuMemcpy Calls", None, 4, ""
            ).records
            gm_ratio = TimeAnalyzer(
                args.filename, "GpuMemcpy Calls", None, 6
            ).records
            gmas_total = TimeAnalyzer(
                args.filename, "GpuMemcpyAsync Calls", None, 4, ""
            ).records
            gms_total = TimeAnalyzer(
                args.filename, "GpuMemcpySync Calls", None, 4, ""
            ).records
            run_info["FINAL_RESULT"]["Framework_Total"] = _head(fo_total)
            run_info["FINAL_RESULT"]["Framework_Ratio"] = _head(fo_ratio)
            run_info["FINAL_RESULT"]["ComputationTime_Total"] = _head(ct_total)
            run_info["FINAL_RESULT"]["GpuMemcpy_Total"] = _head(gm_total)
            run_info["FINAL_RESULT"]["GpuMemcpy_Ratio"] = _head(gm_ratio)
            run_info["FINAL_RESULT"]["GpuMemcpyAsync_Total"] = _head(gmas_total)
            run_info["FINAL_RESULT"]["GpuMemcpySync_Total"] = _head(gms_total)
        else:
            print("Not support!")
    except Exception:
        traceback.print_exc()
    print(
        "{}".format(json.dumps(run_info))
    )  # it's required, for the log file path insert to the database
|
||||
30
PaddleOCR-3.1.0/benchmark/readme.md
Normal file
30
PaddleOCR-3.1.0/benchmark/readme.md
Normal file
@ -0,0 +1,30 @@
|
||||
|
||||
# PaddleOCR DB/EAST/PSE 算法训练benchmark测试
|
||||
|
||||
PaddleOCR/benchmark目录下的文件用于获取并分析训练日志。
|
||||
训练采用icdar2015数据集,包括1000张训练图像和500张测试图像。模型配置采用resnet18_vd作为backbone,分别训练batch_size=8和batch_size=16的情况。
|
||||
|
||||
## 运行训练benchmark
|
||||
|
||||
benchmark/run_det.sh 中包含了四个过程:
|
||||
- 安装依赖
|
||||
- 下载数据
|
||||
- 执行训练
|
||||
- 日志分析获取IPS
|
||||
|
||||
在执行训练部分,会执行单机单卡(默认0号卡)和单机多卡训练,并分别执行batch_size=8和batch_size=16的情况。所以执行完后,每种模型会得到4个日志文件。
|
||||
|
||||
run_det.sh 执行方式如下:
|
||||
|
||||
```
|
||||
# cd PaddleOCR/
|
||||
bash benchmark/run_det.sh
|
||||
```
|
||||
|
||||
以DB为例,将得到四个日志文件,如下:
|
||||
```
|
||||
det_res18_db_v2.0_sp_bs16_fp32_1
|
||||
det_res18_db_v2.0_sp_bs8_fp32_1
|
||||
det_res18_db_v2.0_mp_bs16_fp32_1
|
||||
det_res18_db_v2.0_mp_bs8_fp32_1
|
||||
```
|
||||
61
PaddleOCR-3.1.0/benchmark/run_benchmark_det.sh
Normal file
61
PaddleOCR-3.1.0/benchmark/run_benchmark_det.sh
Normal file
@ -0,0 +1,61 @@
|
||||
#!/usr/bin/env bash
|
||||
# 运行示例:CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh ${run_mode} ${bs_item} ${fp_item} 500 ${model_mode}
|
||||
# 参数说明
|
||||
# Parse positional CLI arguments and set the globals consumed by _train and by
# the downstream log-analysis step (analysis.py via run_model.sh).
function _set_params(){
    run_mode=${1:-"sp"}  # sp = single card | mp = multi card
    batch_size=${2:-"64"}
    fp_item=${3:-"fp32"}  # fp32|fp16
    max_epoch=${4:-"10"}  # optional; lower it to stop training early
    model_item=${5:-"model_item"}
    run_log_path=${TRAIN_LOG_DIR:-$(pwd)}  # TRAIN_LOG_DIR is set later by QA
    # Parameters required by the log-analysis tooling
    base_batch_size=${batch_size}
    mission_name="OCR"
    direction_id="0"
    ips_unit="images/sec"
    skip_steps=2  # skip warm-up steps when parsing the log; some models have slow first steps (required)
    keyword="ips:"  # keyword that selects the data lines in the log (required)
    index="1"
    model_name=${model_item}_bs${batch_size}_${fp_item}  # model_item matches the yml file name; model_name is shown in the dashboard
    # Derived values below; do not modify
    device=${CUDA_VISIBLE_DEVICES//,/ }
    arr=(${device})
    num_gpu_devices=${#arr[*]}
    log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
}
|
||||
# Launch one training run, capture its log, and export job_fail_flag for the
# analysis step. Relies on globals set by _set_params: run_mode, batch_size,
# max_epoch, model_item, model_name, num_gpu_devices, log_file.
function _train(){
    echo "Train on ${num_gpu_devices} GPUs"
    echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

    train_cmd="-c configs/det/${model_item}.yml -o Train.loader.batch_size_per_card=${batch_size} Global.epoch_num=${max_epoch} Global.eval_batch_step=[0,20000] Global.print_batch_step=2"
    case ${run_mode} in
    sp)
        # FIX: original had redundant nested quotes ("python tools/train.py "${train_cmd}"");
        # this is the equivalent, unambiguous form.
        train_cmd="python tools/train.py ${train_cmd}"
        ;;
    mp)
        rm -rf ./mylog
        train_cmd="python -m paddle.distributed.launch --log_dir=./mylog --gpus=$CUDA_VISIBLE_DEVICES tools/train.py ${train_cmd}"
        ;;
    *) echo "choose run_mode(sp or mp)"; exit 1;
    esac
    # Do not modify below this line.
    echo ${train_cmd}
    # Cap one run at 15 minutes; ${train_cmd} stays unquoted on purpose so it
    # word-splits into command + arguments. log_file is quoted (FIX).
    if timeout 15m ${train_cmd} > "${log_file}" 2>&1; then
        echo -e "${model_name}, SUCCESS"
        export job_fail_flag=0
    else
        echo -e "${model_name}, FAIL"
        export job_fail_flag=1
    fi

    # Multi-process runs write per-worker logs; keep worker 0's log only.
    # FIX: replaced the deprecated `[ ... -a ... ]` conjunction and quoted vars.
    if [ "$run_mode" = "mp" ] && [ -d mylog ]; then
        rm "${log_file}"
        cp mylog/workerlog.0 "${log_file}"
    fi
}
|
||||
|
||||
source ${BENCHMARK_ROOT}/scripts/run_model.sh # 在该脚本中会对符合benchmark规范的log使用analysis.py 脚本进行性能数据解析;该脚本在连调时可从benchmark repo中下载https://github.com/PaddlePaddle/benchmark/blob/master/scripts/run_model.sh;如果不联调只想要产出训练log可以注掉本行,提交时需打开
|
||||
_set_params $@
|
||||
#_train # 如果只想产出训练log,不解析,可取消注释
|
||||
_run # 该函数在run_model.sh中,执行时会调用_train; 如果不联调只想要产出训练log可以注掉本行,提交时需打开
|
||||
36
PaddleOCR-3.1.0/benchmark/run_det.sh
Normal file
36
PaddleOCR-3.1.0/benchmark/run_det.sh
Normal file
@ -0,0 +1,36 @@
|
||||
#!/bin/bash
# Reproducible benchmark driver. Expected environment: the standard docker image
# paddlepaddle/paddle:latest-gpu-cuda10.1-cudnn7 with paddle==2.1.2, python 3.7.
# Working directory: ./PaddleOCR
# 1. Install the dependencies this model needs (note any optimization flags).
log_path=${LOG_PATH_INDEX_DIR:-$(pwd)}
python -m pip install -r requirements.txt
# 2. Fetch the dataset and the pretrained models.
wget -P ./train_data/ https://paddleocr.bj.bcebos.com/dygraph_v2.0/test/icdar2015.tar && cd train_data && tar xf icdar2015.tar && cd ../
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet18_vd_pretrained.pdparams
wget -P ./pretrain_models/ https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/ResNet50_vd_ssld_pretrained.pdparams
# 3. Run every model / precision / batch-size combination.

model_mode_list=(det_res18_db_v2.0 det_r50_vd_east det_r50_vd_pse)
fp_item_list=(fp32)
for model_mode in ${model_mode_list[@]}; do
    for fp_item in ${fp_item_list[@]}; do
        # EAST is only benchmarked at batch size 16.
        if [ "${model_mode}" == "det_r50_vd_east" ]; then
            bs_list=(16)
        else
            bs_list=(8 16)
        fi
        for bs_item in ${bs_list[@]}; do
            # FIX: the original echoed ${model_name}, which is never set in this
            # script (it always expanded empty); ${model_mode} is the intended
            # variable.
            echo "index is speed, 1gpus, begin, ${model_mode}"
            run_mode=sp
            log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
            CUDA_VISIBLE_DEVICES=0 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 1 ${model_mode} | tee ${log_path}/${log_name}_speed_1gpus 2>&1  # (5min)
            sleep 60
            echo "index is speed, 8gpus, run_mode is multi_process, begin, ${model_mode}"
            run_mode=mp
            log_name=ocr_${model_mode}_bs${bs_item}_${fp_item}_${run_mode}
            CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash benchmark/run_benchmark_det.sh ${run_mode} ${bs_item} ${fp_item} 2 ${model_mode} | tee ${log_path}/${log_name}_speed_8gpus8p 2>&1
            sleep 60
        done
    done
done
|
||||
@ -0,0 +1,98 @@
|
||||
# PP-OCRv3 rotation-classification (self-supervised RotNet-style) training config.
# NOTE(review): this file was recovered from a diff view with indentation
# stripped; nesting reconstructed to the standard 2-space PaddleOCR layout —
# verify against the original file.
Global:
  debug: false
  use_gpu: true
  epoch_num: 100
  log_smooth_window: 20
  print_batch_step: 10
  save_model_dir: ./output/rec_ppocr_v3_rotnet
  save_epoch_step: 3
  # [start_iter, interval]: evaluate every 2000 iterations from iteration 0.
  eval_batch_step: [0, 2000]
  cal_metric_during_train: true
  pretrained_model: null
  checkpoints: null
  save_inference_dir: null
  use_visualdl: false
  infer_img: doc/imgs_words/ch/word_1.jpg
  character_dict_path: ppocr/utils/ppocr_keys_v1.txt
  max_text_length: 25
  infer_mode: false
  use_space_char: true
  save_res_path: ./output/rec/predicts_chinese_lite_v2.0.txt
Optimizer:
  name: Adam
  beta1: 0.9
  beta2: 0.999
  lr:
    name: Cosine
    learning_rate: 0.001
  regularizer:
    name: L2
    factor: 1.0e-05
Architecture:
  # Classification model: predicts the rotation class of the input crop.
  model_type: cls
  algorithm: CLS
  Transform: null
  Backbone:
    name: MobileNetV1Enhance
    scale: 0.5
    last_conv_stride: [1, 2]
    last_pool_type: avg
  Neck:  # intentionally empty (parses as null): no neck module
  Head:
    name: ClsHead
    # 4 output classes — presumably the four rotation angles; confirm in ClsHead.
    class_dim: 4

Loss:
  name: ClsLoss
  main_indicator: acc

PostProcess:
  name: ClsPostProcess

Metric:
  name: ClsMetric
  main_indicator: acc

Train:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data
    label_file_list:
      - ./train_data/train_list.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - BaseDataAugmentation:
      - RandAugment:
      - SSLRotateResize:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label"]
  loader:
    # Collate function that pairs each image with its rotated variants.
    collate_fn: "SSLRotateCollate"
    shuffle: true
    batch_size_per_card: 32
    drop_last: true
    num_workers: 8
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: ./train_data
    label_file_list:
      - ./train_data/val_list.txt
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - SSLRotateResize:
          image_shape: [3, 48, 320]
      - KeepKeys:
          keep_keys: ["image", "label"]
  loader:
    collate_fn: "SSLRotateCollate"
    shuffle: false
    drop_last: false
    batch_size_per_card: 64
    num_workers: 8
# NOTE(review): this key appeared at the very end of the file; it is presumably
# a top-level profiler switch — confirm the intended nesting in the original.
profiler_options: null
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user