From fbd3b07ca4062c335e0cacfbfb81cc39c74b311a Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Sat, 7 Jan 2023 19:15:33 +0800 Subject: [PATCH 1/2] add 10 papers --- ...al Network for Scene Text Recognition.yaml | 75 +++++++++++++++++ ...al Network for Scene Text Recognition.yaml | 76 +++++++++++++++++ ...ence Model For Scene Text Recognition.yaml | 76 +++++++++++++++++ ...t Recognizer without Human Annotation.yaml | 75 +++++++++++++++++ ...nal Clues for Robust Text Recognition.yaml | 82 +++++++++++++++++++ ...ecognition with a Single Visual Model.yaml | 74 +++++++++++++++++ ...rmuted Autoregressive Sequence Models.yaml | 71 ++++++++++++++++ ...ter Distillation for Text Recognition.yaml | 79 ++++++++++++++++++ ...ransformer for Scene Text Recognition.yaml | 79 ++++++++++++++++++ ...ition via Training Protocol Searching.yaml | 74 +++++++++++++++++ 10 files changed, 761 insertions(+) create mode 100644 paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml create mode 100644 paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml create mode 100644 paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml create mode 100644 paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml create mode 100644 paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml create mode 100644 paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml diff --git a/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml b/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml new file mode 100644 index 000000000..e79162f3d --- /dev/null +++ b/paper_zoo/textrecog/Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition.yaml @@ -0,0 +1,75 @@ +Title: 'Hamming OCR: A Locality Sensitive Hashing Neural Network for Scene Text Recognition' +Abbreviation: HammingOCR +Tasks: + - TextRecog +Venue: arXiv +Year: 2020 +Lab/Company: + - School of Computer and Information Technology, Beijing Jiaotong University, China + - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China + - Baidu Inc., China +URL: + Venue: N/A + Arxiv: 'https://arxiv.org/abs/2009.10874' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Recently, inspired by Transformer, self-attention-based scene text +recognition approaches have achieved outstanding performance. However, we find +that the size of model expands rapidly with the lexicon increasing. Specifically, +the number of parameters for softmax classification layer and output embedding +layer are proportional to the vocabulary size. 
It hinders the development of a +lightweight text recognition model especially applied for Chinese and multiple +languages. Thus, we propose a lightweight scene text recognition model named +Hamming OCR. In this model, a novel Hamming classifier, which adopts locality +sensitive hashing (LSH) algorithm to encode each character, is proposed to +replace the softmax regression and the generated LSH code is directly employed +to replace the output embedding. We also present a simplified transformer +decoder to reduce the number of parameters by removing the feed-forward network +and using cross-layer parameter sharing technique. Compared with traditional +methods, the number of parameters in both classification and embedding layers +is independent on the size of vocabulary, which significantly reduces the +storage requirement without loss of accuracy. Experimental results on several +datasets, including four public benchmaks and a Chinese text dataset synthesized +by SynthText with more than 20,000 characters, shows that Hamming OCR achieves +competitive results.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144293-2f94c36f-a3ec-44ac-a70c-4854ccfa90af.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: 6.6M + Experiment: + Training DataSets: + - MJ + Test DataSets: + Avg.: 74.0 + IIIT5K: + WAICS: 82.6 + SVT: + WAICS: 83.3 + IC13: + WAICS: N/A + IC15: + WAICS: N/A + SVTP: + WAICS: 68.8 + CUTE: + WAICS: 61.1 +Bibtex: '@article{li2020hamming, + title={Hamming ocr: A locality sensitive hashing neural network for scene text recognition}, + author={Li, Bingcong and Tang, Xin and Qi, Xianbiao and Chen, Yihao and Xiao, Rong}, + journal={arXiv preprint arXiv:2009.10874}, + year={2020} +}' diff --git a/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml b/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml new file mode 100644 index 000000000..a605f6c81 --- /dev/null +++ b/paper_zoo/textrecog/MASTER: Multi-Aspect Non-local Network for Scene Text Recognition.yaml @@ -0,0 +1,76 @@ +Title: 'MASTER: Multi-Aspect Non-local Network for Scene Text Recognition' +Abbreviation: MASTER +Tasks: + - TextRecog +Venue: PR +Year: 2021 +Lab/Company: + - School of Computer and Information Technology, Beijing Jiaotong University, China + - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China + - Baidu Inc., China +URL: + Venue: 'https://www.sciencedirect.com/science/article/pii/S0031320321001679' + Arxiv: 'https://arxiv.org/abs/1910.02562' +Paper Reading URL: N/A +Code: 'https://github.com/wenwenyu/MASTER-pytorch' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/master' +PaperType: + - Algorithm +Abstract: 'Attention-based scene text recognizers have gained huge success, which +leverages a more compact intermediate representation to learn 1d- or 2d- attention +by a RNN-based encoder-decoder architecture. However, such methods suffer +from attention-drift problem because high similarity among encoded features +leads to attention confusion under the RNN-based local attention mechanism. +Moreover, RNN-based methods have low efficiency due to poor parallelization. 
+To overcome these problems, we propose the MASTER, a self-attention based scene +text recognizer that (1) not only encodes the input-output attention but also +learns self-attention which encodes feature-feature and target-target relationships +inside the encoder and decoder and (2) learns a more powerful and robust +intermediate representation to spatial distortion, and (3) owns a great training +efficiency because of high training parallelization and a high-speed inference +because of an efficient memory-cache mechanism. Extensive experiments on various +benchmarks demonstrate the superior performance of our MASTER on both regular +and irregular scene text. Pytorch code can be found at https://github.com/wenwenyu/MASTER-pytorch, +and Tensorflow code can be found at https://github.com/jiangxiluning/MASTER-TF.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144560-9732023f-fb02-415e-abfe-0b0ff0ab8425.png' + FPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 55.5 + FLOPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 6.07G + PARAMS: 38.81M + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 88.7 + IIIT5K: + WAICS: 95.0 + SVT: + WAICS: 90.6 + IC13: + WAICS: 95.3 + IC15: + WAICS: 79.4 + SVTP: + WAICS: 84.5 + CUTE: + WAICS: 87.5 +Bibtex: '@article{lu2021master, + title={Master: Multi-aspect non-local network for scene text recognition}, + author={Lu, Ning and Yu, Wenwen and Qi, Xianbiao and Chen, Yihao and Gong, Ping and Xiao, Rong and Bai, Xiang}, + journal={Pattern Recognition}, + volume={117}, + pages={107980}, + year={2021}, + publisher={Elsevier} +}' diff --git a/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml b/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml new file mode 100644 index 000000000..f59546d76 --- /dev/null +++ b/paper_zoo/textrecog/NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition.yaml @@ -0,0 +1,76 @@ +Title: 'NRTR: A No-Recurrence Sequence-to-Sequence Model For Scene Text Recognition' +Abbreviation: NRTR +Tasks: + - TextRecog +Venue: ICDAR +Year: 2019 +Lab/Company: + - Institute of Automation, Chinese Academy of Sciences University of Chinese Academy of Sciences +URL: + Venue: 'https://ieeexplore.ieee.org/abstract/document/8978180/' + Arxiv: 'https://arxiv.org/abs/1806.00926' +Paper Reading URL: N/A +Code: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/nrtr' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/nrtr' +PaperType: + - Algorithm +Abstract: 'Scene text recognition has attracted a great many researches due to +its importance to various applications. Existing methods mainly adopt recurrence +or convolution based networks. Though have obtained good performance, these +methods still suffer from two limitations: slow training speed due to the +internal recurrence of RNNs, and high complexity due to stacked convolutional +layers for long-term feature extraction. This paper, for the first time, +proposes a no-recurrence sequence-to-sequence text recognizer, named NRTR, that +dispenses with recurrences and convolutions entirely. NRTR follows the +encoder-decoder paradigm, where the encoder uses stacked self-attention to +extract image features, and the decoder applies stacked self-attention to +recognize texts based on encoder output. 
NRTR relies solely on self-attention +mechanism thus could be trained with more parallelization and less complexity. +Considering scene image has large variation in text and background, we further +design a modality-transform block to effectively transform 2D input images to +1D sequences, combined with the encoder to extract more discriminative features. + NRTR achieves state-of-the-art or highly competitive performance on both + regular and irregular benchmarks, while requires only a small fraction of + training time compared to the best model from the literature (at least 8 + times faster).' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211147170-f8ceb124-cde4-4323-b770-493962cdfcb0.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 87.4 + IIIT5K: + WAICS: 90.1 + SVT: + WAICS: 91.5 + IC13: + WAICS: 95.8 + IC15: + WAICS: 79.4 + SVTP: + WAICS: 86.6 + CUTE: + WAICS: 80.9 +Bibtex: '@inproceedings{sheng2019nrtr, + title={NRTR: A no-recurrence sequence-to-sequence model for scene text recognition}, + author={Sheng, Fenfen and Chen, Zhineng and Xu, Bo}, + booktitle={2019 International conference on document analysis and recognition (ICDAR)}, + pages={781--786}, + year={2019}, + organization={IEEE} +}' diff --git a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml new file mode 100644 index 000000000..14329522b --- /dev/null +++ b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml @@ -0,0 +1,75 @@ +Title: 'Pushing the Performance Limit of Scene Text Recognizer without Human Annotation' +Abbreviation: Zheng et al +Tasks: + - TextRecog +Venue: CVPR +Year: 2022 +Lab/Company: + - School of Computer Science and Ningbo Institute, Northwestern Polytechnical University, China + - Samsung Advanced Institute of Technology (SAIT), South Korea +URL: + Venue: 'https://openaccess.thecvf.com/content/CVPR2022/html/Zheng_Pushing_the_Performance_Limit_of_Scene_Text_Recognizer_Without_Human_CVPR_2022_paper.html' + Arxiv: 'https://arxiv.org/abs/2204.07714' +Paper Reading URL: N/A +Code: N/A +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Scene text recognition (STR) attracts much attention over the years +because of its wide application. Most methods train STR model in a fully +supervised manner which requires large amounts of labeled data. Although +synthetic data contributes a lot to STR, it suffers from the real-tosynthetic +domain gap that restricts model performance. In this work, we aim to boost +STR models by leveraging both synthetic data and the numerous real unlabeled +images, exempting human annotation cost thoroughly. A robust consistency +regularization based semi-supervised framework is proposed for STR, which can +effectively solve the instability issue due to domain inconsistency between +synthetic and real images. A character-level consistency regularization is +designed to mitigate the misalignment between characters in sequence recognition. +Extensive experiments on standard text recognition benchmarks demonstrate +the effectiveness of the proposed method. 
It can steadily improve existing +STR models, and boost an STR model to achieve new state-of-the-art results. +To our best knowledge, this is the first consistency regularization based +framework that applies successfully to STR.' +MODELS: + Architecture: + - Attenion + Learning Method: + - Self Supervised + - Semi Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144099-f6db366a-e34b-401d-9b3c-13d08c1c1068.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 94.5 + IIIT5K: + WAICS: 96.5 + SVT: + WAICS: 96.3 + IC13: + WAICS: 98.3 + IC15: + WAICS: 89.3 + SVTP: + WAICS: 93.3 + CUTE: + WAICS: 93.4 +Bibtex: '@inproceedings{zheng2022pushing, + title={Pushing the Performance Limit of Scene Text Recognizer without Human Annotation}, + author={Zheng, Caiyuan and Li, Hui and Rhee, Seon-Min and Han, Seungju and Han, Jae-Joon and Wang, Peng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={14116--14125}, + year={2022} +}' diff --git a/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml b/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml new file mode 100644 index 000000000..82f9108eb --- /dev/null +++ b/paper_zoo/textrecog/RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition.yaml @@ -0,0 +1,82 @@ +Title: 'RobustScanner: Dynamically Enhancing Positional Clues for Robust Text Recognition' +Abbreviation: RobustScanner +Tasks: + - TextRecog +Venue: ECCV +Year: 2020 +Lab/Company: + - SenseTime Research, Hong Kong, China + - School of Cyber Science and Engineering, Xi’an Jiaotong University, Xi’an, China +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-030-58529-7_9' + Arxiv: 'https://arxiv.org/abs/2007.07542' +Paper Reading URL: N/A +Code: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/robust_scanner' +Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/robust_scanner' +PaperType: + - Algorithm +Abstract: 'The attention-based encoder-decoder framework has recently achieved +impressive results for scene text recognition, and many variants have emerged +with improvements in recognition quality. However, it performs poorly on +contextless texts (e.g., random character sequences) which is unacceptable in +most of real application scenarios. In this paper, we first deeply investigate +the decoding process of the decoder. We empirically find that a representative +character-level sequence decoder utilizes not only context information but also +positional information. Contextual information, which the existing approaches +heavily rely on, causes the problem of attention drift. To suppress such +side-effect, we propose a novel position enhancement branch, and dynamically +fuse its outputs with those of the decoder attention module for scene text +recognition. Specifically, it contains a position aware module to enable the +encoder to output feature vectors encoding their own spatial positions, and an +attention module to estimate glimpses using the positional clue (i.e., the +current decoding time step) only. The dynamic fusion is conducted for more +robust feature via an element-wise gate mechanism. 
Theoretically, our proposed
+method, dubbed RobustScanner, decodes individual characters with dynamic ratio
+between context and positional clues, and utilizes more positional ones when
+the decoding sequences with scarce context, and thus is robust and practical.
+Empirically, it has achieved new state-of-the-art results on popular regular
+and irregular text recognition benchmarks while without much performance drop
+on contextless benchmarks, validating its robustness in both contextual and
+contextless application scenarios.'
+MODELS:
+  Architecture:
+    - Attention
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211147345-0515c292-00d1-458f-b5c7-b3a940a0c12c.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - MJ
+      - ST
+      - Real
+    Test DataSets:
+      Avg.: 88.9
+      IIIT5K:
+        WAICS: 95.4
+      SVT:
+        WAICS: 89.3
+      IC13:
+        WAICS: 94.1
+      IC15:
+        WAICS: 79.2
+      SVTP:
+        WAICS: 82.9
+      CUTE:
+        WAICS: 92.4
+Bibtex: '@inproceedings{yue2020robustscanner,
+  title={Robustscanner: Dynamically enhancing positional clues for robust text recognition},
+  author={Yue, Xiaoyu and Kuang, Zhanghui and Lin, Chenhao and Sun, Hongbin and Zhang, Wayne},
+  booktitle={European Conference on Computer Vision},
+  pages={135--151},
+  year={2020},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml b/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml
new file mode 100644
index 000000000..f52858a26
--- /dev/null
+++ b/paper_zoo/textrecog/SVTR: Scene Text Recognition with a Single Visual Model.yaml
@@ -0,0 +1,74 @@
+Title: 'SVTR: Scene Text Recognition with a Single Visual Model'
+Abbreviation: SVTR
+Tasks:
+  - TextRecog
+Venue: IJCAI
+Year: 2022
+Lab/Company:
+  - School of Computer and Information Technology, Beijing Jiaotong University, China
+  - Shanghai Collaborative Innovation Center of Intelligent Visual Computing, School of Computer Science, Fudan University, China
+  - Baidu Inc., China
+URL:
+  Venue: 'https://www.ijcai.org/proceedings/2022/0124.pdf'
+  Arxiv: 'https://arxiv.org/abs/2205.00159'
+Paper Reading URL: 'https://mp.weixin.qq.com/s/kR2CwHwE78STJiSfwlv9QA'
+Code: 'https://github.com/PaddlePaddle/PaddleOCR'
+Supported In MMOCR: 'https://github.com/open-mmlab/mmocr/tree/1.x/configs/textrecog/svtr'
+PaperType:
+  - Algorithm
+Abstract: 'Dominant scene text recognition models commonly contain two building blocks,
+a visual model for feature extraction and a sequence model for text transcription.
+This hybrid architecture, although accurate, is complex and less efficient. In
+this study, we propose a Single Visual model for Scene Text recognition within
+the patch-wise image tokenization framework, which dispenses with the sequential
+modeling entirely. The method, termed SVTR, firstly decomposes an image text
+into small patches named character components. Afterward, hierarchical stages
+are recurrently carried out by component-level mixing, merging and/or combining.
+Global and local mixing blocks are devised to perceive the inter-character and
+intra-character patterns, leading to a multi-grained character component
+perception. Thus, characters are recognized by a simple linear prediction.
+Experimental results on both English and Chinese scene text recognition tasks
+demonstrate the effectiveness of SVTR.
SVTR-L (Large) achieves highly +competitive accuracy in English and outperforms existing methods by a large +margin in Chinese, while running faster. In addition, SVTR-T (Tiny) is an +effective and much smaller model, which shows appealing speed at inference. +The code is publicly available at https://github.com/PaddlePaddle/PaddleOCR.' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211143670-0913ccd8-1f5d-407b-8b64-e782f0cb037e.png' + FPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 55.5 + FLOPS: + DEVICE: 'NVIDIA 1080Ti' + ITEM: 6.07G + PARAMS: 38.81M + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: 92.6 + IIIT5K: + WAICS: 96.3 + SVT: + WAICS: 91.7 + IC13: + WAICS: 97.2 + IC15: + WAICS: 86.6 + SVTP: + WAICS: 88.4 + CUTE: + WAICS: 95.1 +Bibtex: '@article{du2022svtr, + title={SVTR: Scene Text Recognition with a Single Visual Model}, + author={Du, Yongkun and Chen, Zhineng and Jia, Caiyan and Yin, Xiaoting and Zheng, Tianlun and Li, Chenxia and Du, Yuning and Jiang, Yu-Gang}, + journal={arXiv preprint arXiv:2205.00159}, + year={2022} +}' diff --git a/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml b/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml new file mode 100644 index 000000000..865619aed --- /dev/null +++ b/paper_zoo/textrecog/Scene Text Recognition with Permuted Autoregressive Sequence Models.yaml @@ -0,0 +1,71 @@ +Title: 'Scene Text Recognition with Permuted Autoregressive Sequence Models' +Abbreviation: PARSeq +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - Electrical and Electronics Engineering Institute, University of the Philippines, Diliman +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_11' + Arxiv: 'https://arxiv.org/abs/2207.06966' +Paper Reading URL: N/A +Code: 'https://github.com/baudm/parseq' +Supported In MMOCR: N/S +PaperType: + - Algorithm +Abstract: 'Context-aware STR methods typically use internal autoregressive (AR) +language models (LM). Inherent limitations of AR models motivated two-stage +methods which employ an external LM. The conditional independence of the external +LM on the input image may cause it to erroneously rectify correct predictions, +leading to significant inefficiencies. Our method, PARSeq, learns an ensemble +of internal AR LMs with shared weights using Permutation Language Modeling. It +unifies context-free non-AR and context-aware AR inference, and iterative +refinement using bidirectional context. Using synthetic training data, PARSeq +achieves state-of-the-art (SOTA) results in STR benchmarks (91.9% accuracy) +and more challenging datasets. It establishes new SOTA results (96.0% accuracy) +when trained on real data. PARSeq is optimal on accuracy vs parameter count, +FLOPS, and latency because of its simple, unified structure and parallel token +processing. Due to its extensive use of attention, it is robust on +arbitrarily-oriented text which is common in real-world images. Code, pretrained +weights, and data are available at: https://github.com/baudm/parseq.' 
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Explicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211143463-0d347e44-4ea9-4b17-857f-99f6e34378c2.png'
+  FPS:
+    DEVICE: N/A
+    ITEM: N/A
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: N/A
+  Experiment:
+    Training DataSets:
+      - Real
+    Test DataSets:
+      Avg.: 96.5
+      IIIT5K:
+        WAICS: 99.1
+      SVT:
+        WAICS: 97.9
+      IC13:
+        WAICS: 98.4
+      IC15:
+        WAICS: 89.6
+      SVTP:
+        WAICS: 95.7
+      CUTE:
+        WAICS: 98.3
+Bibtex: '@inproceedings{bautista2022scene,
+  title={Scene Text Recognition with Permuted Autoregressive Sequence Models},
+  author={Bautista, Darwin and Atienza, Rowel},
+  booktitle={European Conference on Computer Vision},
+  pages={178--196},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml b/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml
new file mode 100644
index 000000000..848b2744b
--- /dev/null
+++ b/paper_zoo/textrecog/Self-supervised Character-to-Character Distillation for Text Recognition.yaml
@@ -0,0 +1,79 @@
+Title: 'Self-supervised Character-to-Character Distillation for Text Recognition'
+Abbreviation: CCD
+Tasks:
+  - TextRecog
+Venue: arXiv
+Year: 2022
+Lab/Company:
+  - MoE Key Lab of Artificial Intelligence, AI Institute, Shanghai Jiao Tong University
+URL:
+  Venue: N/A
+  Arxiv: 'https://arxiv.org/abs/2211.00288'
+Paper Reading URL: N/A
+Code: N/A
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'Handling complicated text images (e.g., irregular structures, low
+resolution, heavy occlusion, and even illumination), existing supervised text
+recognition methods are data-hungry. Although these methods employ large-scale
+synthetic text images to reduce the dependence on annotated real images, the
+domain gap limits the recognition performance. Therefore, exploring the robust
+text feature representation on unlabeled real images by self-supervised learning
+is a good solution. However, existing self-supervised text recognition methods
+only execute sequence-to-sequence representation learning by roughly splitting
+the visual features along the horizontal axis, which will damage the character
+structures. Besides, these sequential-level self-learning methods limit the
+availability of geometric-based data augmentation, as large-scale geometry
+augmentation leads to sequence-to-sequence inconsistency. To address the
+above-mentioned issues, we proposed a novel self-supervised character-to-character
+distillation method, CCD. Specifically, we delineate the character structures
+of unlabeled real images by designing a self-supervised character segmentation
+module, and further apply the segmentation results to build character-level
+representation learning. CCD differs from prior works in that we propose a
+character-level pretext task to learn more fine-grained feature representations.
+Besides, compared with the inflexible augmentations of sequence-to-sequence
+models, our work satisfies character-to-character representation consistency,
+across various transformations (e.g., geometry and colour), to generate robust
+text features in the representative space. Experiments demonstrate that CCD
+achieves state-of-the-art performance on publicly available text recognition
+benchmarks.'
+MODELS: + Architecture: + - Transformer + Learning Method: + - Self-Supervised + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211142941-06500063-59a7-485c-bfd3-817dc367a1a7.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - Real + Test DataSets: + Avg.: 94.3 + IIIT5K: + WAICS: 97.1 + SVT: + WAICS: 96.0 + IC13: + WAICS: 97.5 + IC15: + WAICS: 87.5 + SVTP: + WAICS: 91.6 + CUTE: + WAICS: 95.8 +Bibtex: '@article{guan2022self, + title={Self-supervised Character-to-Character Distillation}, + author={Guan, Tongkun and Shen, Wei}, + journal={arXiv preprint arXiv:2211.00288}, + year={2022} +}' diff --git a/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml b/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml new file mode 100644 index 000000000..1b8582336 --- /dev/null +++ b/paper_zoo/textrecog/Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition.yaml @@ -0,0 +1,79 @@ +Title: 'Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition' +Abbreviation: CornerTransformer +Tasks: + - TextRecog +Venue: ECCV +Year: 2022 +Lab/Company: + - Huazhong University of Science and Technology, China + - Adobe Research, USA +URL: + Venue: 'https://link.springer.com/chapter/10.1007/978-3-031-19815-1_18' + Arxiv: 'https://arxiv.org/abs/2208.00438' +Paper Reading URL: 'https://mp.weixin.qq.com/s/QRpZXv2EyU5hsoBqcFdztQ' +Code: 'https://github.com/xdxie/WordArt' +Supported In MMOCR: N/S +PaperType: + - Algorithm + - Dataset +Abstract: 'Artistic text recognition is an extremely challenging task with a +wide range of applications. However, current scene text recognition methods +mainly focus on irregular text while have not explored artistic text specifically. +The challenges of artistic text recognition include the various appearance +with special-designed fonts and effects, the complex connections and overlaps +between characters, and the severe interference from background patterns. To +alleviate these problems, we propose to recognize the artistic text at three +levels. Firstly, corner points are applied to guide the extraction of local +features inside characters, considering the robustness of corner structures to +appearance and shape. In this way, the discreteness of the corner points cuts +off the connection between characters, and the sparsity of them improves the +robustness for background interference. Secondly, we design a character +contrastive loss to model the character-level feature, improving the feature +representation for character classification. Thirdly, we utilize Transformer +to learn the global feature on image-level and model the global relationship of +the corner points, with the assistance of a corner-query cross-attention +mechanism. Besides, we provide an artistic text dataset to benchmark the +performance. Experimental results verify the significant superiority of our +proposed method on artistic text recognition and also achieve stateof-the-art +performance on several blurred and perspective datasets.' 
+MODELS:
+  Architecture:
+    - Transformer
+  Learning Method:
+    - Supervised
+  Language Modality:
+    - Implicit Language Model
+  Network Structure: 'https://user-images.githubusercontent.com/65173622/211143209-24979724-6343-426a-9d2e-8d076dc4d48a.png'
+  FPS:
+    DEVICE: 'NVIDIA TITAN XP'
+    ITEM: 3.39
+  FLOPS:
+    DEVICE: N/A
+    ITEM: N/A
+  PARAMS: 85.7M
+  Experiment:
+    Training DataSets:
+      - ST
+      - MJ
+    Test DataSets:
+      Avg.: 91.9
+      IIIT5K:
+        WAICS: 95.9
+      SVT:
+        WAICS: 94.6
+      IC13:
+        WAICS: 96.4
+      IC15:
+        WAICS: 86.3
+      SVTP:
+        WAICS: 91.5
+      CUTE:
+        WAICS: 92.0
+Bibtex: '@inproceedings{xie2022toward,
+  title={Toward Understanding WordArt: Corner-Guided Transformer for Scene Text Recognition},
+  author={Xie, Xudong and Fu, Ling and Zhang, Zhifei and Wang, Zhaowen and Bai, Xiang},
+  booktitle={European Conference on Computer Vision},
+  pages={303--321},
+  year={2022},
+  organization={Springer}
+}'
diff --git a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml
new file mode 100644
index 000000000..d55eb0bb1
--- /dev/null
+++ b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml
@@ -0,0 +1,74 @@
+Title: 'Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching'
+Abbreviation: Chu et al
+Tasks:
+  - TextRecog
+Venue: arXiv
+Year: 2022
+Lab/Company:
+  - Wangxuan Institute of Computer Technology, Peking University
+  - State Key Lab of CAD & CG, Zhejiang University
+  - Ant Group
+URL:
+  Venue: N/A
+  Arxiv: 'https://arxiv.org/abs/2203.06696'
+Paper Reading URL: N/A
+Code: 'https://github.com/VDIGPKU/STR-it'
+Supported In MMOCR: N/S
+PaperType:
+  - Algorithm
+Abstract: 'The development of scene text recognition (STR) in the era of deep
+learning has been mainly focused on novel architectures of STR models. However,
+training protocol (i.e., settings of the hyperparameters involved in the
+training of STR models), which plays an equally important role in successfully
+training a good STR model, is under-explored for scene text recognition. In this
+work, we attempt to improve the accuracy of existing STR models by searching
+for optimal training protocol. Specifically, we develop a training protocol
+search algorithm, based on a newly designed search space and an efficient search
+algorithm using evolutionary optimization and proxy tasks. Experimental results
+show that our searched training protocol can improve the recognition accuracy
+of mainstream STR models by 2.7%∼3.9%. In particular, with the searched training
+protocol, TRBA-Net achieves 2.1% higher accuracy than the state-of-the-art STR
+model (i.e., EFIFSTR), while the inference speed is 2.3× and 3.7× faster on CPU
+and GPU respectively. Extensive experiments are conducted to demonstrate the
+effectiveness of the proposed method and the generalization ability of the
+training protocol found by our search method.
Code is available at +https://github.com/VDIGPKU/STR-it' +MODELS: + Architecture: + - Transformer + Learning Method: + - Supervised + Language Modality: + - Implicit Language Model + Network Structure: 'https://user-images.githubusercontent.com/65173622/211144450-26de55a1-bf77-4367-916d-cc5c33bd33a1.png' + FPS: + DEVICE: N/A + ITEM: N/A + FLOPS: + DEVICE: N/A + ITEM: N/A + PARAMS: N/A + Experiment: + Training DataSets: + - MJ + - ST + Test DataSets: + Avg.: N/A + IIIT5K: + WAICS: N/A + SVT: + WAICS: N/A + IC13: + WAICS: N/A + IC15: + WAICS: N/A + SVTP: + WAICS: N/A + CUTE: + WAICS: N/A +Bibtex: '@article{chu2022training, + title={Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching}, + author={Chu, Xiaojie and Wang, Yongtao and Shen, Chunhua and Chen, Jingdong and Chu, Wei}, + journal={arXiv preprint arXiv:2203.06696}, + year={2022} +}' From 30b21ae2bcaeafba8c56be7aca1328345c6ad3b7 Mon Sep 17 00:00:00 2001 From: Mountchciken Date: Wed, 18 Jan 2023 19:48:54 +0800 Subject: [PATCH 2/2] update --- ...mit of Scene Text Recognizer without Human Annotation.yaml | 4 ++-- ...cene Text Recognition via Training Protocol Searching.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml index 14329522b..cd864c873 100644 --- a/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml +++ b/paper_zoo/textrecog/Pushing the Performance Limit of Scene Text Recognizer without Human Annotation.yaml @@ -35,8 +35,8 @@ MODELS: Architecture: - Attenion Learning Method: - - Self Supervised - - Semi Supervised + - Self-Supervised + - Semi-Supervised - Supervised Language Modality: - Implicit Language Model diff --git a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml index d55eb0bb1..0b8ea8aab 100644 --- a/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml +++ b/paper_zoo/textrecog/Training Protocol Matters: Towards Accurate Scene Text Recognition via Training Protocol Searching.yaml @@ -35,7 +35,7 @@ training protocol found by our search method. Code is available at https://github.com/VDIGPKU/STR-it' MODELS: Architecture: - - Transformer + - Attention Learning Method: - Supervised Language Modality: