From affffbf3ce6b17c5cc30de1484613300c9d1c82d Mon Sep 17 00:00:00 2001
From: jamjamjon
Date: Sat, 21 Sep 2024 18:44:27 +0800
Subject: [PATCH] Add Florence2 support; parallelize per-image postprocessing

---
 README.md                  |   5 +-
 examples/florence2/main.rs |   6 +-
 src/lib.rs                 |   2 +-
 src/models/florence2.rs    | 216 ++++++++++++++++++-------------------
 4 files changed, 109 insertions(+), 120 deletions(-)

diff --git a/README.md b/README.md
index a3ba257..9090e04 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@
 - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10)
 - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
 - **Vision Models**: [RTDETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [DB](https://arxiv.org/abs/1911.08947), [SVTR](https://arxiv.org/abs/2205.00159), [Depth-Anything-v1-v2](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569)
-- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World)
+- **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)
 <summary>Click to expand Supported Models</summary>
@@ -71,6 +71,9 @@
 | [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) | ✅ | ✅ | ✅ | ✅ |
 | [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) | ✅ | ✅ | | |
 | [Sapiens](https://github.com/facebookresearch/sapiens/tree/main) | Body Part Segmentation | [demo](examples/sapiens) | ✅ | ✅ | | |
+| [Florence2](https://arxiv.org/abs/2311.06242) | A Variety of Vision Tasks | [demo](examples/florence2) | ✅ | ✅ | | |
+
+
diff --git a/examples/florence2/main.rs b/examples/florence2/main.rs
index 4d0859a..546ebb9 100644
--- a/examples/florence2/main.rs
+++ b/examples/florence2/main.rs
@@ -232,15 +232,11 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
         Task::ReferringExpressionSegmentation(_) => {
             let annotator = annotator
                 .clone()
-                .with_polygons_alpha(200)
                 .with_saveout("Referring-Expression-Segmentation");
             annotator.annotate(&xs, ys_);
         }
         Task::RegionToSegmentation(..) => {
-            let annotator = annotator
-                .clone()
-                .with_polygons_alpha(200)
-                .with_saveout("Region-To-Segmentation");
+            let annotator = annotator.clone().with_saveout("Region-To-Segmentation");
             annotator.annotate(&xs, ys_);
         }
         Task::OcrWithRegion => {
diff --git a/src/lib.rs b/src/lib.rs
index fd63333..ce9d586 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@
 //! - **YOLO Models**: [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10)
 //! - **SAM Models**: [SAM](https://github.com/facebookresearch/segment-anything), [SAM2](https://github.com/facebookresearch/segment-anything-2), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM)
 //! - **Vision Models**: [RTDETR](https://arxiv.org/abs/2304.08069), [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo), [DB](https://arxiv.org/abs/1911.08947), [SVTR](https://arxiv.org/abs/2205.00159), [Depth-Anything-v1-v2](https://github.com/LiheYoung/Depth-Anything), [DINOv2](https://github.com/facebookresearch/dinov2), [MODNet](https://github.com/ZHKKKe/MODNet), [Sapiens](https://arxiv.org/abs/2408.12569)
-//! - **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World)
+//! - **Vision-Language Models**: [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [Florence2](https://arxiv.org/abs/2311.06242)
 //!
 //! # Examples
 //!
diff --git a/src/models/florence2.rs b/src/models/florence2.rs
index 8bf9e1e..66eb497 100644
--- a/src/models/florence2.rs
+++ b/src/models/florence2.rs
@@ -17,9 +17,9 @@ pub struct Florence2 {
     pub encoder: OrtEngine,
     pub decoder: OrtEngine,
    pub decoder_merged: OrtEngine,
-    pub height: MinOptMax,
-    pub width: MinOptMax,
-    pub batch: MinOptMax,
+    height: MinOptMax,
+    width: MinOptMax,
+    batch: MinOptMax,
     tokenizer: Tokenizer,
     max_length: usize,
     quantizer: Quantizer,
@@ -97,13 +97,15 @@ impl Florence2 {
         xs: &[DynamicImage],
         tasks: &[Task],
     ) -> Result<BTreeMap<Task, Vec<Y>>> {
-        // encode batch images
+        let mut ys: BTreeMap<Task, Vec<Y>> = BTreeMap::new();
+
+        // encode images
         let image_embeddings = self.encode_images(xs)?;
 
         // note: the length of xs is not always equal to batch size
         self.batch.update(xs.len() as isize);
 
-        // tasks loop
+        // build pb
         let pb = build_progress_bar(
             tasks.len() as u64,
             " Working On",
@@ -111,14 +113,11 @@
             crate::PROGRESS_BAR_STYLE_CYAN_2,
         )?;
 
-        let mut ys: BTreeMap<Task, Vec<Y>> = BTreeMap::new();
+        // tasks
         for task in tasks.iter() {
-            // update pb
             pb.inc(1);
             pb.set_message(format!("{:?}", task));
 
-            let mut ys_task: Vec<Y> = Vec::new();
-
             // construct prompt and encode
             let input_ids = self
                 .encode_prompt(task)?
@@ -129,103 +128,98 @@
             // run
             let texts = self.run_batch(&image_embeddings, &text_embeddings)?;
 
-            // postprocess
-            for batch in 0..self.batch() {
-                // image size
-                let image_width = xs[batch].width() as usize;
-                let image_height = xs[batch].height() as usize;
-
-                // texts cleanup
-                let text = texts[batch]
-                    .as_str()
-                    .replace("</s>", "")
-                    .replace("<s>", "")
-                    .replace("<pad>", "");
-
-                // cope with each task
-                if let Task::Caption(_) | Task::Ocr = task {
-                    ys_task.push(Y::default().with_texts(&[text]));
-                } else {
-                    let elems = Self::loc_parse(&text)?;
-                    match task {
-                        Task::RegionToCategory(..) | Task::RegionToDescription(..) => {
-                            let text = elems[0][0].clone(); // extract text only
-                            ys_task.push(Y::default().with_texts(&[text]));
-                        }
-                        Task::ObjectDetection
-                        | Task::OpenSetDetection(_)
-                        | Task::DenseRegionCaption
-                        | Task::CaptionToPhraseGrounding(_) => {
-                            let y_bboxes: Vec<Bbox> = elems
-                                .par_iter()
-                                .enumerate()
-                                .flat_map(|(i, elem)| {
-                                    let name = &elem[0];
-                                    let y_bboxes: Vec<Bbox> = Self::process_bboxes(
-                                        &elem[1..],
-                                        &self.quantizer,
-                                        image_width,
-                                        image_height,
-                                        Some((name, i)),
-                                    );
-                                    y_bboxes
-                                })
-                                .collect();
-
-                            ys_task.push(Y::default().with_bboxes(&y_bboxes));
-                        }
-                        Task::RegionProposal => {
-                            let y_bboxes: Vec<Bbox> = Self::process_bboxes(
-                                &elems[0],
-                                &self.quantizer,
-                                image_width,
-                                image_height,
-                                None,
-                            );
-
-                            ys_task.push(Y::default().with_bboxes(&y_bboxes));
-                        }
-
-                        Task::ReferringExpressionSegmentation(_)
-                        | Task::RegionToSegmentation(..) => {
-                            let points = Self::process_polygons(
-                                &elems[0],
-                                &self.quantizer,
-                                image_width,
-                                image_height,
-                            );
-
-                            ys_task.push(Y::default().with_polygons(&[
-                                Polygon::default().with_points(&points).with_id(0),
-                            ]));
-                        }
-                        Task::OcrWithRegion => {
-                            let y_polygons: Vec<Polygon> = elems
-                                .par_iter()
-                                .enumerate()
-                                .map(|(i, elem)| {
-                                    let text = &elem[0];
-                                    let points = Self::process_polygons(
-                                        &elem[1..],
-                                        &self.quantizer,
-                                        image_width,
-                                        image_height,
-                                    );
-
-                                    Polygon::default()
-                                        .with_name(text)
-                                        .with_points(&points)
-                                        .with_id(i as _)
-                                })
-                                .collect();
-
-                            ys_task.push(Y::default().with_polygons(&y_polygons));
-                        }
-
-                        _ => anyhow::bail!("Unsupported Florence2 task."),
-                    };
-                }
-            }
+            // postprocess each image of the batch in parallel
+            let ys_task = (0..self.batch())
+                .into_par_iter()
+                .map(|batch| {
+                    // image size
+                    let image_width = xs[batch].width() as usize;
+                    let image_height = xs[batch].height() as usize;
+
+                    // texts cleanup
+                    let text = texts[batch]
+                        .as_str()
+                        .replace("</s>", "")
+                        .replace("<s>", "")
+                        .replace("<pad>", "");
+
+                    // postprocess
+                    let mut y = Y::default();
+                    if let Task::Caption(_) | Task::Ocr = task {
+                        y = y.with_texts(&[text]);
+                    } else {
+                        let elems = Self::loc_parse(&text)?;
+                        match task {
+                            Task::RegionToCategory(..) | Task::RegionToDescription(..) => {
+                                let text = elems[0][0].clone();
+                                y = y.with_texts(&[text]);
+                            }
+                            Task::ObjectDetection
+                            | Task::OpenSetDetection(_)
+                            | Task::DenseRegionCaption
+                            | Task::CaptionToPhraseGrounding(_) => {
+                                let y_bboxes: Vec<Bbox> = elems
+                                    .par_iter()
+                                    .enumerate()
+                                    .flat_map(|(i, elem)| {
+                                        Self::process_bboxes(
+                                            &elem[1..],
+                                            &self.quantizer,
+                                            image_width,
+                                            image_height,
+                                            Some((&elem[0], i)),
+                                        )
+                                    })
+                                    .collect();
+                                y = y.with_bboxes(&y_bboxes);
+                            }
+                            Task::RegionProposal => {
+                                let y_bboxes: Vec<Bbox> = Self::process_bboxes(
+                                    &elems[0],
+                                    &self.quantizer,
+                                    image_width,
+                                    image_height,
+                                    None,
+                                );
+                                y = y.with_bboxes(&y_bboxes);
+                            }
+                            Task::ReferringExpressionSegmentation(_)
+                            | Task::RegionToSegmentation(..) => {
+                                let points = Self::process_polygons(
+                                    &elems[0],
+                                    &self.quantizer,
+                                    image_width,
+                                    image_height,
+                                );
+                                y = y.with_polygons(&[Polygon::default()
+                                    .with_points(&points)
+                                    .with_id(0)]);
+                            }
+                            Task::OcrWithRegion => {
+                                let y_polygons: Vec<Polygon> = elems
+                                    .par_iter()
+                                    .enumerate()
+                                    .map(|(i, elem)| {
+                                        let points = Self::process_polygons(
+                                            &elem[1..],
+                                            &self.quantizer,
+                                            image_width,
+                                            image_height,
+                                        );
+                                        Polygon::default()
+                                            .with_name(&elem[0])
+                                            .with_points(&points)
+                                            .with_id(i as _)
+                                    })
+                                    .collect();
+                                y = y.with_polygons(&y_polygons);
+                            }
+                            _ => anyhow::bail!("Unsupported Florence2 task."),
+                        };
+                    }
+                    Ok(y)
+                })
+                .collect::<Result<Vec<Y>>>()?;
 
             ys.insert(task.clone(), ys_task);
         }
@@ -264,19 +258,14 @@ impl Florence2 {
         let encoder_k0 = decoder_outputs[3].clone();
         let encoder_v0 = decoder_outputs[4].clone();
-
         let encoder_k1 = decoder_outputs[7].clone();
         let encoder_v1 = decoder_outputs[8].clone();
-
         let encoder_k2 = decoder_outputs[11].clone();
         let encoder_v2 = decoder_outputs[12].clone();
-
         let encoder_k3 = decoder_outputs[15].clone();
         let encoder_v3 = decoder_outputs[16].clone();
-
         let encoder_k4 = decoder_outputs[19].clone();
         let encoder_v4 = decoder_outputs[20].clone();
-
         let encoder_k5 = decoder_outputs[23].clone();
         let encoder_v5 = decoder_outputs[24].clone();
@@ -285,8 +274,9 @@
         // save last batch tokens
         let mut last_tokens: Vec<f32> = vec![0.; self.batch()];
-
         let mut logits_sampler = LogitsSampler::new();
+
+        // generate
         for _ in 0..self.max_length {
             let logits = &decoder_outputs["logits"];
             let decoder_k0 = &decoder_outputs[1];
@@ -302,7 +292,7 @@
             let decoder_k5 = &decoder_outputs[21];
             let decoder_v5 = &decoder_outputs[22];
 
-            // Decode each token for each batch
+            // decode each token for each batch
             for (i, logit) in logits.axis_iter(Axis(0)).enumerate() {
                 if !finished[i] {
                     let token_id = logits_sampler.decode(
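
Note for reviewers (not part of the patch): a minimal sketch of how the reworked batch API is intended to be driven from the caller side. Only the parameter and return types are visible in the hunks above (`&[DynamicImage]`, `&[Task]` -> `Result<BTreeMap<Task, Vec<Y>>>`); the method name `run_with_tasks`, the `&mut` receiver, and the `Task::Caption(0)` detail-level argument are assumptions for illustration, not confirmed API.

    // Hypothetical driver; names marked below are assumptions.
    use image::DynamicImage;
    use usls::{models::Florence2, Task};

    fn demo(model: &mut Florence2, xs: &[DynamicImage]) -> anyhow::Result<()> {
        // One image-encoder pass is shared across all requested tasks,
        // so asking for several tasks at once amortizes the encoding cost.
        let tasks = [
            Task::Caption(0), // assumed caption detail level
            Task::Ocr,
            Task::ObjectDetection,
        ];

        // `run_with_tasks` is an assumed name for the method whose
        // signature is changed in this patch.
        let ys = model.run_with_tasks(xs, &tasks)?;

        // results come back keyed by task, one `Y` per input image
        for (task, ys_task) in ys.iter() {
            println!("{:?} -> {} result(s)", task, ys_task.len());
        }
        Ok(())
    }

Since per-image postprocessing now runs inside `(0..self.batch()).into_par_iter().map(...)`, a parse failure for any image short-circuits through `.collect::<Result<Vec<Y>>>()` and surfaces as an error to the caller, while successful runs still yield one `Vec<Y>` per task as before.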