From fbd3b07ca4062c335e0cacfbfb81cc39c74b311a Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Sat, 7 Jan 2023 19:15:33 +0800 Subject: [PATCH 1/2] add 10 papers --- ...al Network for Scene Text Recognition.yaml | 75 +++++++++++++++++ ...al Network for Scene Text Recognition.yaml | 76 +++++++++++++++++ ...ence Model For Scene Text Recognition.yaml | 76 +++++++++++++++++ ...t Recognizer without Human Annotation.yaml | 75 +++++++++++++++++ ...nal Clues for Robust Text Recognition.yaml | 82 +++++++++++++++++++ ...ecognition with a Single Visual Model.yaml | 74 +++++++++++++++++ ...rmuted Autoregressive Sequence Models.yaml | 71 ++++++++++++++++ ...ter Distillation for Text Recognition.yaml | 79 ++++++++++++++++++ ...ransformer for Scene Text Recognition.yaml | 79 ++++++++++++++++++ ...ition via Training Protocol Searching.yaml | 74 +++++++++++++++++ 10 files changed, 761 insertions(+) create mode 100644 paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml create mode 100644 paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml create mode 100644 paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml create mode 100644 paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml create mode 100644 paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml diff --git a/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml b/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml new file mode 100644 index 000000000..e79162f3d --- /dev/null +++ b/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml @@ -0,0 +1,75 @@ +Title: 'Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition' +Abbreviation: HammingOCR +Tasks: + - TextRecog +Venue: arXiv +Year: 2020 +Lab/Company: + - School of Computer and Information Technology, Beijing Jiaotong University, China + - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China + - Baidu Inc., China +URL: + Venue: N/A + Arxiv: 'https://arxiv.org/abs/2009.10874' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Recently, inspired by Transformer, self-attention-based scene text +recognition approaches have achieved outstanding performance. However, we find +that the size of model expands rapidly with the lexicon increasing. Specifically, +the number of parameters for softmax classification layer and output embedding +layer are proportional to the vocabulary size. 
It hinders the development of a +lightweight text recognition model especially applied for Chinese and multiple +languages. Thus, we propose a lightweight scene text recognition model named +Hamming OCR. In this model, a novel Hamming classifier, which adopts locality +sensitive hashing (LSH) algorithm to encode each character, is proposed to +replace the softmax regression and the generated LSH code is directly employed +to replace the output embedding. We also present a simplified transformer +decoder to reduce the number of parameters by removing the feed-forward network +and using cross-layer parameter sharing technique. Compared with traditional +methods, the number of parameters in both classification and embedding layers +is independent on the size of vocabulary, which significantly reduces the +storage requirement without loss of accuracy. Experimental results on several +datasets, including four public benchmaks and a Chinese text dataset synthesized +by SynthText with more than 20,000 characters, shows that Hamming OCR achieves +competitive results.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144293-2f94c36f-a3ec-44ac-a70c-4854ccfa90af.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: 6.6M + Experiment: + Training DataSets: + - MJ + Test DataSets: + Avg.: 74.0 + IIIT5K: + WAICS: 82.6 + SVT: + WAICS: 83.3 + IC13: + WAICS: N/A + IC15: + WAICS: N/A + SVTP: + WAICS: 68.8 + CUTE: + WAICS: 61.1 +Bibtex: '@article{li2020hamming, + title={Hamming ocr: A locality sensitive hashing neural network for scene text recognition}, + author={Li, Bingcong and Tang, Xin and Qi, Xianbiao and Chen, Yihao and Xiao, Rong}, + journal={arXiv preprint arXiv:2009.10874}, + year={2020} +}' diff --git a/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml b/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml new file mode 100644 index 000000000..a605f6c81 --- /dev/null +++ b/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml @@ -0,0 +1,76 @@ +Title: 'MASTER: Multi-Aspect Non-local Network for Scene Text Recognition' +Abbreviation: MASTER +Tasks: + - TextRecog +Venue: PR +Year: 2021 +Lab/Company: + - School of Computer and Information Technology, Beijing Jiaotong University, China + - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China + - Baidu Inc., China +URL: + Venue: 'https://www.sciencedirect.com/science/article/pii/S0031320321001679' + Arxiv: 'https://arxiv.org/abs/1910.02562' +Paper Reading URL: N/A +Code: 'https://github.com/wenwenyu/MASTER-pytorch' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/master' +PaperType: + - Algorithm +Abstract: 'Attention-based scene text recognizers have gained huge success, which +leverages a more compact intermediate representation to learn 1d- or 2d- attention +by a RNN-based encoder-decoder architecture. However, such methods suffer +from attention-drift problem because high similarity among encoded features +leads to attention confusion under the RNN-based local attention mechanism. +Moreover, RNN-based methods have low efficiency due to poor parallelization. 
+To overcome these problems, we propose the MASTER, a self-attention based scene +text recognizer that (1) not only encodes the input-output attention but also +learns self-attention which encodes feature-feature and target-target relationships +inside the encoder and decoder and (2) learns a more powerful and robust +intermediate representation to spatial distortion, and (3) owns a great training +efficiency because of high training parallelization and a high-speed inference +because of an efficient memory-cache mechanism. Extensive experiments on various +benchmarks demonstrate the superior performance of our MASTER on both regular +and irregular scene text. Pytorch code can be found at https://github.com/wenwenyu/MASTER-pytorch, +and Tensorflow code can be found at https://github.com/jiangxiluning/MASTER-TF.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144560-9732023f-fb02-415e-abfe-0b0ff0ab8425.png' + FPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 55.5 + FLOPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 6.07G + PARAMS: 38.81M + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 88.7 + IIIT5K: + WAICS: 95.0 + SVT: + WAICS: 90.6 + IC13: + WAICS: 95.3 + IC15: + WAICS: 79.4 + SVTP: + WAICS: 84.5 + CUTE: + WAICS: 87.5 +Bibtex: '@article{lu2021master, + title={Master: Multi-aspect non-local network for scene text recognition}, + author={Lu, Ning and Yu, Wenwen and Qi, Xianbiao and Chen, Yihao and Gong, Ping and Xiao, Rong and Bai, Xiang}, + journal={Pattern Recognition}, + volume={117}, + pages={107980}, + year={2021}, + publisher={Elsevier} +}' diff --git a/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml b/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml new file mode 100644 index 000000000..f59546d76 --- /dev/null +++ b/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml @@ -0,0 +1,76 @@ +Title: 'NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition' +Abbreviation: NRTR +Tasks: + - TextRecog +Venue: ICDAR +Year: 2019 +Lab/Company: + - Institute of Automation, Chinese Academy of Sciences University of Chinese Academy of Sciences +URL: + Venue: 'https://ieeexplore.ieee.org/abstract/document/8978180/' + Arxiv: 'https://arxiv.org/abs/1806.00926' +Paper Reading URL: N/A +Code: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/nrtr' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/nrtr' +PaperType: + - Algorithm +Abstract: 'Scene text recognition has attracted a great many researches due to +its importance to various applications. Existing methods mainly adopt recurrence +or convolution based networks. Though have obtained good performance, these +methods still suffer from two limitations: slow training speed due to the +internal recurrence of RNNs, and high complexity due to stacked convolutional +layers for long-term feature extraction. This paper, for the first time, +proposes a no-recurrence sequence-to-sequence text recognizer, named NRTR, that +dispenses with recurrences and convolutions entirely. NRTR follows the +encoder-decoder paradigm, where the encoder uses stacked self-attention to +extract image features, and the decoder applies stacked self-attention to +recognize texts based on encoder output. 
NRTR relies solely on self-attention +mechanism thus could be trained with more parallelization and less complexity. +Considering scene image has large variation in text and background, we further +design a modality-transform block to effectively transform 2D input images to +1D sequences, combined with the encoder to extract more discriminative features. + NRTR achieves state-of-the-art or highly competitive performance on both + regular and irregular benchmarks, while requires only a small fraction of + training time compared to the best model from the literature (at least 8 + times faster).' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211147170-f8ceb124-cde4-4323-b770-493962cdfcb0.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 87.4 + IIIT5K: + WAICS: 90.1 + SVT: + WAICS: 91.5 + IC13: + WAICS: 95.8 + IC15: + WAICS: 79.4 + SVTP: + WAICS: 86.6 + CUTE: + WAICS: 80.9 +Bibtex: '@inproceedings{sheng2019nrtr, + title={NRTR: A no-recurrence sequence-to-sequence model for scene text recognition}, + author={Sheng, Fenfen and Chen, Zhineng and Xu, Bo}, + booktitle={2019 International conference on document analysis and recognition (ICDAR)}, + pages={781--786}, + year={2019}, + organization={IEEE} +}' diff --git a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml new file mode 100644 index 000000000..14329522b --- /dev/null +++ b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml @@ -0,0 +1,75 @@ +Title: 'Pushing the Performance Limit of Scene Text Recognizer without Human Annotation' +Abbreviation: Zheng et al +Tasks: + - TextRecog +Venue: CVPR +Year: 2022 +Lab/Company: + - School of Computer Science and Ningbo Institute, Northwestern Polytechnical University, China + - Samsung Advanced Institute of Technology (SAIT), South Korea +URL: + Venue: 'https://openaccess.thecvf.com/content/CVPR2022/html/Zheng_Pushing_the_Performance_Limit_of_Scene_Text_Recognizer_Without_Human_CVPR_2022_paper.html' + Arxiv: 'https://arxiv.org/abs/2204.07714' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Scene text recognition (STR) attracts much attention over the years +because of its wide application. Most methods train STR model in a fully +supervised manner which requires large amounts of labeled data. Although +synthetic data contributes a lot to STR, it suffers from the real-tosynthetic +domain gap that restricts model performance. In this work, we aim to boost +STR models by leveraging both synthetic data and the numerous real unlabeled +images, exempting human annotation cost thoroughly. A robust consistency +regularization based semi-supervised framework is proposed for STR, which can +effectively solve the instability issue due to domain inconsistency between +synthetic and real images. A character-level consistency regularization is +designed to mitigate the misalignment between characters in sequence recognition. +Extensive experiments on standard text recognition benchmarks demonstrate +the effectiveness of the proposed method. 
It can steadily improve existing +STR models, and boost an STR model to achieve new state-of-the-art results. +To our best knowledge, this is the first consistency regularization based +framework that applies successfully to STR.' +MODELS: + Architecture: + - Attenion + Learning Method: + - Self Supervised + - Semi Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144099-f6db366a-e34b-401d-9b3c-13d08c1c1068.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 94.5 + IIIT5K: + WAICS: 96.5 + SVT: + WAICS: 96.3 + IC13: + WAICS: 98.3 + IC15: + WAICS: 89.3 + SVTP: + WAICS: 93.3 + CUTE: + WAICS: 93.4 +Bibtex: '@inproceedings{zheng2022pushing, + title={Pushing the Performance Limit of Scene Text Recognizer without Human Annotation}, + author={Zheng, Caiyuan and Li, Hui and Rhee, Seon-Min and Han, Seungju and Han, Jae-Joon and Wang, Peng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14116--14125}, + year={2022} +}' diff --git a/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml b/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml new file mode 100644 index 000000000..82f9108eb --- /dev/null +++ b/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml @@ -0,0 +1,82 @@ +Title: 'RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition' +Abbreviation: RobustScanner +Tasks: + - TextRecog +Venue: ECCV +Year: 2020 +Lab/Company: + - SenseTime Research, Hong Kong, China + - School of Cyber Science and Engineering, Xi’an Jiaotong University, Xi’an, China +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-030-58529-7_9' + Arxiv: 'https://arxiv.org/abs/2007.07542' +Paper Reading URL: N/A +Code: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/robust_scanner' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/robust_scanner' +PaperType: + - Algorithm +Abstract: 'The attention-based encoder-decoder framework has recently achieved +impressive results for scene text recognition, and many variants have emerged +with improvements in recognition quality. However, it performs poorly on +contextless texts (e.g., random character sequences) which is unacceptable in +most of real application scenarios. In this paper, we first deeply investigate +the decoding process of the decoder. We empirically find that a representative +character-level sequence decoder utilizes not only context information but also +positional information. Contextual information, which the existing approaches +heavily rely on, causes the problem of attention drift. To suppress such +side-effect, we propose a novel position enhancement branch, and dynamically +fuse its outputs with those of the decoder attention module for scene text +recognition. Specifically, it contains a position aware module to enable the +encoder to output feature vectors encoding their own spatial positions, and an +attention module to estimate glimpses using the positional clue (i.e., the +current decoding time step) only. The dynamic fusion is conducted for more +robust feature via an element-wise gate mechanism. 
Theoretically, our proposed
+method, dubbed RobustScanner, decodes individual characters with dynamic ratio
+between context and positional clues, and utilizes more positional ones when
+the decoding sequences with scarce context, and thus is robust and practical.
+Empirically, it has achieved new state-of-the-art results on popular regular
+and irregular text recognition benchmarks while without much performance drop
+on contextless benchmarks, validating its robustness in both contextual and
+contextless application scenarios.'
+MODELS:
+  Architecture:
+    - Attention
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211147345-0515c292-00d1-458f-b5c7-b3a940a0c12c.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - MJ
+      - ST
+      - Real
+    Test DataSets:
+      Avg.: 88.9
+      IIIT5K:
+        WAICS: 95.4
+      SVT:
+        WAICS: 89.3
+      IC13:
+        WAICS: 94.1
+      IC15:
+        WAICS: 79.2
+      SVTP:
+        WAICS: 82.9
+      CUTE:
+        WAICS: 92.4
+Bibtex: '@inproceedings{yue2020robustscanner,
+  title={Robustscanner: Dynamically enhancing positional clues for robust text recognition},
+  author={Yue, Xiaoyu and Kuang, Zhanghui and Lin, Chenhao and Sun, Hongbin and Zhang, Wayne},
+  booktitle={European Conference on Computer Vision},
+  pages={135--151},
+  year={2020},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml b/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml
new file mode 100644
index 000000000..f52858a26
--- /dev/null
+++ b/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml
@@ -0,0 +1,74 @@
+Title: 'SVTR: Scene Text Recognition with a Single Visual Model'
+Abbreviation: SVTR
+Tasks:
+  - TextRecog
+Venue: IJCAI
+Year: 2022
+Lab/Company:
+  - School of Computer and Information Technology, Beijing Jiaotong University, China
+  - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China
+  - Baidu Inc., China
+URL:
+  Venue: 'https://www.ijcai.org/proceedings/2022/0124.pdf'
+  Arxiv: 'https://arxiv.org/abs/2205.00159'
+Paper Reading URL: 'https://mp.weixin.qq.com/s/kR2CwHwE78STJiSfwlv9QA'
+Code: 'https://github.com/PaddlePaddle/PaddleOCR'
+Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/svtr'
+PaperType:
+  - Algorithm
+Abstract: 'Dominant scene text recognition models commonly contain two building blocks,
+a visual model for feature extraction and a sequence model for text transcription.
+This hybrid architecture, although accurate, is complex and less efficient. In
+this study, we propose a Single Visual model for Scene Text recognition within
+the patch-wise image tokenization framework, which dispenses with the sequential
+modeling entirely. The method, termed SVTR, firstly decomposes an image text
+into small patches named character components. Afterward, hierarchical stages
+are recurrently carried out by component-level mixing, merging and/or combining.
+Global and local mixing blocks are devised to perceive the inter-character and
+intra-character patterns, leading to a multi-grained character component
+perception. Thus, characters are recognized by a simple linear prediction.
+Experimental results on both English and Chinese scene text recognition tasks
+demonstrate the effectiveness of SVTR.
SVTR-L (Large) achieves highly +competitive accuracy in English and outperforms existing methods by a large +margin in Chinese, while running faster. In addition, SVTR-T (Tiny) is an +effective and much smaller model, which shows appealing speed at inference. +The code is publicly available at https://github.com/PaddlePaddle/PaddleOCR.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211143670-0913ccd8-1f5d-407b-8b64-e782f0cb037e.png' + FPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 55.5 + FLOPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 6.07G + PARAMS: 38.81M + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 92.6 + IIIT5K: + WAICS: 96.3 + SVT: + WAICS: 91.7 + IC13: + WAICS: 97.2 + IC15: + WAICS: 86.6 + SVTP: + WAICS: 88.4 + CUTE: + WAICS: 95.1 +Bibtex: '@article{du2022svtr, + title={SVTR: Scene Text Recognition with a Single Visual Model}, + author={Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + journal={arXiv preprint arXiv:2205.00159}, + year={2022} +}' diff --git a/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml b/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml new file mode 100644 index 000000000..865619aed --- /dev/null +++ b/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml @@ -0,0 +1,71 @@ +Title: 'Scene Text Recognition with Permuted Autoregressive Sequence Models' +Abbreviation: PARSeq +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - Electrical and Electronics Engineering Institute, University of the Philippines, Diliman +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_11' + Arxiv: 'https://arxiv.org/abs/2207.06966' +Paper Reading URL: N/A +Code: 'https://github.com/baudm/parseq' +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Context-aware STR methods typically use internal autoregressive (AR) +language models (LM). Inherent limitations of AR models motivated two-stage +methods which employ an external LM. The conditional independence of the external +LM on the input image may cause it to erroneously rectify correct predictions, +leading to significant inefficiencies. Our method, PARSeq, learns an ensemble +of internal AR LMs with shared weights using Permutation Language Modeling. It +unifies context-free non-AR and context-aware AR inference, and iterative +refinement using bidirectional context. Using synthetic training data, PARSeq +achieves state-of-the-art (SOTA) results in STR benchmarks (91.9% accuracy) +and more challenging datasets. It establishes new SOTA results (96.0% accuracy) +when trained on real data. PARSeq is optimal on accuracy vs parameter count, +FLOPS, and latency because of its simple, unified structure and parallel token +processing. Due to its extensive use of attention, it is robust on +arbitrarily-oriented text which is common in real-world images. Code, pretrained +weights, and data are available at: https://github.com/baudm/parseq.' 
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Explicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211143463-0d347e44-4ea9-4b17-857f-99f6e34378c2.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - Real
+    Test DataSets:
+      Avg.: 96.5
+      IIIT5K:
+        WAICS: 99.1
+      SVT:
+        WAICS: 97.9
+      IC13:
+        WAICS: 98.4
+      IC15:
+        WAICS: 89.6
+      SVTP:
+        WAICS: 95.7
+      CUTE:
+        WAICS: 98.3
+Bibtex: '@inproceedings{bautista2022scene,
+  title={Scene Text Recognition with Permuted Autoregressive Sequence Models},
+  author={Bautista, Darwin and Atienza, Rowel},
+  booktitle={European Conference on Computer Vision},
+  pages={178--196},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml b/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml
new file mode 100644
index 000000000..848b2744b
--- /dev/null
+++ b/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml
@@ -0,0 +1,79 @@
+Title: 'Self-supervised Character-to-Character Distillation for Text Recognition'
+Abbreviation: CCD
+Tasks:
+  - TextRecog
+Venue: arXiv
+Year: 2022
+Lab/Company:
+  - MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University
+URL:
+  Venue: N/A
+  Arxiv: 'https://arxiv.org/abs/2211.00288'
+Paper Reading URL: N/A
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'Handling complicated text images (e.g., irregular structures, low
+resolution, heavy occlusion, and even illumination), existing supervised text
+recognition methods are data-hungry. Although these methods employ large-scale
+synthetic text images to reduce the dependence on annotated real images, the
+domain gap limits the recognition performance. Therefore, exploring the robust
+text feature representation on unlabeled real images by self-supervised learning
+is a good solution. However, existing self-supervised text recognition methods
+only execute sequence-to-sequence representation learning by roughly splitting
+the visual features along the horizontal axis, which will damage the character
+structures. Besides, these sequential-level self-learning methods limit the
+availability of geometric-based data augmentation, as large-scale geometry
+augmentation leads to sequence-to-sequence inconsistency. To address the
+above-mentioned issues, we proposed a novel self-supervised character-to-character
+distillation method, CCD. Specifically, we delineate the character structures
+of unlabeled real images by designing a self-supervised character segmentation
+module, and further apply the segmentation results to build character-level
+representation learning. CCD differs from prior works in that we propose a
+character-level pretext task to learn more fine-grained feature representations.
+Besides, compared with the inflexible augmentations of sequence-to-sequence
+models, our work satisfies character-to-character representation consistency,
+across various transformations (e.g., geometry and colour), to generate robust
+text features in the representative space. Experiments demonstrate that CCD
+achieves state-of-the-art performance on publicly available text recognition
+benchmarks.'
+MODELS: + Architecture: + - Transformer + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211142941-06500063-59a7-485c-bfd3-817dc367a1a7.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - Real + Test DataSets: + Avg.: 94.3 + IIIT5K: + WAICS: 97.1 + SVT: + WAICS: 96.0 + IC13: + WAICS: 97.5 + IC15: + WAICS: 87.5 + SVTP: + WAICS: 91.6 + CUTE: + WAICS: 95.8 +Bibtex: '@article{guan2022self, + title={Self-supervised Character-to-Character Distillation}, + author={Guan, Tongkun and Shen, Wei}, + journal={arXiv preprint arXiv:2211.00288}, + year={2022} +}' diff --git a/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml b/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml new file mode 100644 index 000000000..1b8582336 --- /dev/null +++ b/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml @@ -0,0 +1,79 @@ +Title: 'Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition' +Abbreviation: CornerTransformer +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - Huazhong University of Science and Technology, China + - Adobe Research, USA +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_18' + Arxiv: 'https://arxiv.org/abs/2208.00438' +Paper Reading URL: 'https://mp.weixin.qq.com/s/QRpZXv2EyU5hsoBqcFdztQ' +Code: 'https://github.com/xdxie/WordArt' +Supported In MMOCR: N/S +PaperType: + - Algorithm + - Dataset +Abstract: 'Artistic text recognition is an extremely challenging task with a +wide range of applications. However, current scene text recognition methods +mainly focus on irregular text while have not explored artistic text specifically. +The challenges of artistic text recognition include the various appearance +with special-designed fonts and effects, the complex connections and overlaps +between characters, and the severe interference from background patterns. To +alleviate these problems, we propose to recognize the artistic text at three +levels. Firstly, corner points are applied to guide the extraction of local +features inside characters, considering the robustness of corner structures to +appearance and shape. In this way, the discreteness of the corner points cuts +off the connection between characters, and the sparsity of them improves the +robustness for background interference. Secondly, we design a character +contrastive loss to model the character-level feature, improving the feature +representation for character classification. Thirdly, we utilize Transformer +to learn the global feature on image-level and model the global relationship of +the corner points, with the assistance of a corner-query cross-attention +mechanism. Besides, we provide an artistic text dataset to benchmark the +performance. Experimental results verify the significant superiority of our +proposed method on artistic text recognition and also achieve stateof-the-art +performance on several blurred and perspective datasets.' 
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211143209-24979724-6343-426a-9d2e-8d076dc4d48a.png'
+  FPS:
+    DEVICE: 'NVIDIA TITAN XP'
+    ITEM: 3.39
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: 85.7M
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+    Test DataSets:
+      Avg.: 91.9
+      IIIT5K:
+        WAICS: 95.9
+      SVT:
+        WAICS: 94.6
+      IC13:
+        WAICS: 96.4
+      IC15:
+        WAICS: 86.3
+      SVTP:
+        WAICS: 91.5
+      CUTE:
+        WAICS: 92.0
+Bibtex: '@inproceedings{xie2022toward,
+  title={Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition},
+  author={Xie, Xudong and Fu, Ling and Zhang, Zhifei and Wang, Zhaowen and Bai, Xiang},
+  booktitle={European Conference on Computer Vision},
+  pages={303--321},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml
new file mode 100644
index 000000000..d55eb0bb1
--- /dev/null
+++ b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml
@@ -0,0 +1,74 @@
+Title: 'Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching'
+Abbreviation: Chu et al
+Tasks:
+  - TextRecog
+Venue: arXiv
+Year: 2022
+Lab/Company:
+  - Wangxuan Institute of Computer Technology, Peking University
+  - State Key Lab of CAD & CG, Zhejiang University
+  - Ant Group
+URL:
+  Venue: N/A
+  Arxiv: 'https://arxiv.org/abs/2203.06696'
+Paper Reading URL: N/A
+Code: 'https://github.com/VDIGPKU/STR-it'
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'The development of scene text recognition (STR) in the era of deep
+learning has been mainly focused on novel architectures of STR models. However,
+training protocol (i.e., settings of the hyperparameters involved in the
+training of STR models), which plays an equally important role in successfully
+training a good STR model, is under-explored for scene text recognition. In this
+work, we attempt to improve the accuracy of existing STR models by searching
+for optimal training protocol. Specifically, we develop a training protocol
+search algorithm, based on a newly designed search space and an efficient search
+algorithm using evolutionary optimization and proxy tasks. Experimental results
+show that our searched training protocol can improve the recognition accuracy
+of mainstream STR models by 2.7%∼3.9%. In particular, with the searched training
+protocol, TRBA-Net achieves 2.1% higher accuracy than the state-of-the-art STR
+model (i.e., EFIFSTR), while the inference speed is 2.3× and 3.7× faster on CPU
+and GPU respectively. Extensive experiments are conducted to demonstrate the
+effectiveness of the proposed method and the generalization ability of the
+training protocol found by our search method.
Code is available at +https://github.com/VDIGPKU/STR-it' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144450-26de55a1-bf77-4367-916d-cc5c33bd33a1.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: N/A + IIIT5K: + WAICS: N/A + SVT: + WAICS: N/A + IC13: + WAICS: N/A + IC15: + WAICS: N/A + SVTP: + WAICS: N/A + CUTE: + WAICS: N/A +Bibtex: '@article{chu2022training, + title={Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching}, + author={Chu, Xiaojie and Wang, Yongtao and Shen, Chunhua and Chen, Jingdong and Chu, Wei}, + journal={arXiv preprint arXiv:2203.06696}, + year={2022} +}' From 30b21ae2bcaeafba8c56be7aca1328345c6ad3b7 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Wed, 18 Jan 2023 19:48:54 +0800 Subject: [PATCH 2/2] update --- ...mit of Scene Text Recognizer without Human Annotation.yaml | 4 ++-- ...cene Text Recognition via Training Protocol Searching.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml index 14329522b..cd864c873 100644 --- a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml +++ b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml @@ -35,8 +35,8 @@ MODELS: Architecture: - Attenion Learning Method: - - Self Supervised - - Semi Supervised + - Self-Supervised + - Semi-Supervised - Supervised Language Modality: - Implicit Language Model diff --git a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml index d55eb0bb1..0b8ea8aab 100644 --- a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml +++ b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml @@ -35,7 +35,7 @@ training protocol found by our search method. Code is available at https://github.com/VDIGPKU/STR-it' MODELS: Architecture: - - Transformer + - Attention Learning Method: - Supervised Language Modality: