Building Cloud AI with Copilot – Faster R-CNN Azure HTTP Function SKU Results

Introduction

While testing the FasterRCNNObjectDetectionHttpTrigger function with Telerik Fiddler Classic and my “standard” test image, I noticed the response bodies were different sizes depending on the App Service plan SKU.

Initially the application plan was an S1 SKU (1 vCPU, 1.75GB RAM)

The output JSON was 641 bytes

[
  {
    "label": "person",
    "score": 0.9998331,
    "box": {
      "x1": 445.9223, "y1": 124.11987, "x2": 891.18915, "y2": 696.37164
    }
  },
  {
    "label": "person",
    "score": 0.9994991,
    "box": {
      "x1": 0, "y1": 330.16595, "x2": 471.0475, "y2": 761.35846
    }
  },
  {
    "label": "baseball bat",
    "score": 0.9952342,
    "box": { "x1": 869.8053, "y1": 336.96188, "x2": 1063.2261, "y2": 467.74136
    }
  },
  {
    "label": "sports ball",
    "score": 0.9945949,
    "box": { "x1": 1040.916, "y1": 372.41507, "x2": 1071.8958, "y2": 402.50424
    }
  },
  {
    "label": "baseball glove",
    "score": 0.9943546,
    "box": {
      "x1": 377.8922, "y1": 431.95053, "x2": 458.4937, "y2": 536.52124
    }
  },
  {
    "label": "person",
    "score": 0.51779467,
    "box": {
      "x1": 0, "y1": 239.91418, "x2": 60.342667, "y2": 397.17004
    }
  }
]

The application plan was scaled to a Premium v3 P0V3 (1 vCPU, 4GB RAM)

The output JSON was 637 bytes

[
  {
    "label": "person",
    "score": 0.9998332,
    "box": {
      "x1": 445.9223, "y1": 124.1199, "x2": 891.18915, "y2": 696.3716
    }
  },
  {
    "label": "person",
    "score": 0.9994991,
    "box": { "x1": 0, "y1": 330.16595, "x2": 471.0475, "y2": 761.35846
    }
  },
  {
    "label": "baseball bat",
    "score": 0.9952342,
    "box": {
      "x1": 869.8053, "y1": 336.9619, "x2": 1063.2261, "y2": 467.74133
    }
  },
  {
    "label": "sports ball",
    "score": 0.994595,
    "box": {
      "x1": 1040.916, "y1": 372.41507, "x2": 1071.8958, "y2": 402.50424
    }
  },
  {
    "label": "baseball glove",
    "score": 0.9943546,
    "box": {
      "x1": 377.8922, "y1": 431.95053, "x2": 458.4937, "y2": 536.52124
    }
  },
  {
    "label": "person",
    "score": 0.51779467,
    "box": {
      "x1": 0, "y1": 239.91418, "x2": 60.342667, "y2": 397.17004
    }
  }
]

The application plan was scaled to a Premium v3 P1V3 (2 vCPU, 8GB RAM)

The output JSON was 641 bytes

[
  {
    "label": "person",
    "score": 0.9998331,
    "box": {
      "x1": 445.9223, "y1": 124.11987, "x2": 891.18915, "y2": 696.37164
    }
  },
  {
    "label": "person",
    "score": 0.9994991,
    "box": {
      "x1": 0, "y1": 330.16595, "x2": 471.0475, "y2": 761.35846
    }
  },
  {
    "label": "baseball bat",
    "score": 0.9952342,
    "box": {
      "x1": 869.8053, "y1": 336.96188, "x2": 1063.2261, "y2": 467.74136
    }
  },
  {
    "label": "sports ball",
    "score": 0.9945949,
    "box": {
      "x1": 1040.916, "y1": 372.41507, "x2": 1071.8958, "y2": 402.50424
    }
  },
  {
    "label": "baseball glove",
    "score": 0.9943546,
    "box": {
      "x1": 377.8922, "y1": 431.95053, "x2": 458.4937, "y2": 536.52124
    }
  },
  {
    "label": "person",
    "score": 0.51779467,
    "box": {
      "x1": 0, "y1": 239.91418, "x2": 60.342667, "y2": 397.17004
    }
  }
]

The application plan was scaled to a Premium v3 P2V3 (4 vCPU, 16GB RAM)

The output JSON was 641 bytes

[
  {
    "label": "person",
    "score": 0.9998331,
    "box": {
      "x1": 445.9223, "y1": 124.11987, "x2": 891.18915, "y2": 696.37164
    }
  },
  {
    "label": "person",
    "score": 0.9994991,
    "box": {
      "x1": 0, "y1": 330.16595, "x2": 471.0475, "y2": 761.35846
    }
  },
  {
    "label": "baseball bat",
    "score": 0.9952342,
    "box": {
      "x1": 869.8053, "y1": 336.96188, "x2": 1063.2261, "y2": 467.74136
    }
  },
  {
    "label": "sports ball",
    "score": 0.9945949,
    "box": {
      "x1": 1040.916, "y1": 372.41507, "x2": 1071.8958, "y2": 402.50424
    }
  },
  {
    "label": "baseball glove",
    "score": 0.9943546,
    "box": {
      "x1": 377.8922, "y1": 431.95053, "x2": 458.4937, "y2": 536.52124 }
  },
  {
    "label": "person",
    "score": 0.51779467,
    "box": {
      "x1": 0, "y1": 239.91418, "x2": 60.342667, "y2": 397.17004
    }
  }
]

The application plan was scaled to a Premium v2 P1V2 (1 vCPU, 3.5GB RAM)

The output JSON was 637 bytes

[
  {
    "label": "person",
    "score": 0.9998332,
    "box": {
      "x1": 445.9223, "y1": 124.1199, "x2": 891.18915, "y2": 696.3716
    }
  },
  {
    "label": "person",
    "score": 0.9994991,
    "box": {
      "x1": 0, "y1": 330.16595, "x2": 471.0475, "y2": 761.35846
    }
  },
  {
    "label": "baseball bat",
    "score": 0.9952342,
    "box": {
      "x1": 869.8053, "y1": 336.9619, "x2": 1063.2261, "y2": 467.74133
    }
  },
  {
    "label": "sports ball",
    "score": 0.994595,
    "box": {
      "x1": 1040.916, "y1": 372.41507, "x2": 1071.8958, "y2": 402.50424
    }
  },
  {
    "label": "baseball glove",
    "score": 0.9943546,
    "box": {
      "x1": 377.8922, "y1": 431.95053, "x2": 458.4937, "y2": 536.52124
    }
  },
  {
    "label": "person",
    "score": 0.51779467,
    "box": {
      "x1": 0, "y1": 239.91418, "x2": 60.342667, "y2": 397.17004
    }
  }
]

Summary

The differences between the 637-byte and 641-byte responses were small.

I'm not certain why this happens; my current best guess is memory pressure.
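Whatever causes the last-digit differences, the byte gap itself is consistent with .NET's shortest round-trip float formatting: four of the values serialise one character shorter in the 637-byte response, which accounts exactly for the 641 vs. 637 bytes. A minimal sketch (my illustration, assuming System.Text.Json):

// Minimal sketch (my illustration): .NET serialises floats using the shortest
// string that round-trips, so two values one bit apart can differ in length.
using System;
using System.Text.Json;

float s1Score = 0.9945949f;   // "sports ball" score from the S1 plan
float p0v3Score = 0.994595f;  // "sports ball" score from the P0V3 plan

Console.WriteLine(JsonSerializer.Serialize(s1Score));   // 0.9945949 (9 characters)
Console.WriteLine(JsonSerializer.Serialize(p0v3Score)); // 0.994595 (8 characters)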

Building Cloud AI with Copilot – Faster R-CNN Azure HTTP Function “Dog Food”

Introduction

A couple of months ago a web crawler visited every page on my website (it would be interesting to know if my GitHub repositories were crawled as well) and I wondered if this might impact my Copilot or GitHub Copilot experiments. My blogging about Azure HTTP Trigger functions with Ultralytics Yolo, YoloSharp, ResNet, Faster R-CNN, Open Neural Network Exchange (ONNX) etc. is fairly “niche”, so any improvements in the understanding of the problems and generated code might be visible.

please write an httpTrigger azure function that uses Faster RCNN and ONNX to detect the object in an image uploaded in the body of an HTTP Post

GitHub Copilot had used SixLabors ImageSharp, the ILogger was injected into the constructor, the code checked that an image was in the body of the HTTP POST, and the object classes were loaded from a text file. I had to manually add some NuGets and using directives before the code compiled and ran in the emulator, but this was a definite improvement.

To test the implementation, I used Telerik Fiddler Classic to HTTP POST my “standard” test image to the function.

GitHub Copilot had generated code that checked the image was in the body of the HTTP POST, so I had to modify the Telerik Fiddler Classic request.

I also had to fix up the Content-Type header.

The path to the ONNX file was wrong, and I had to create a labels.txt file from Python code.

The Azure HTTP Trigger function ran but failed because the preprocessing of the image didn't implement the specified preprocessing steps:

Change DenseTensor to BGR (based on https://github.com/onnx/models/tree/main/validated/vision/object_detection_segmentation/faster-rcnn#preprocessing-steps)

Normalise colour values with mean = [102.9801, 115.9465, 122.7717]
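Together these boil down to loading the DenseTensor in BGR channel order and subtracting the per-channel means, rather than using normalised 0..1 RGB values. A minimal condensed sketch (mine, not the generated code; the full generated function appears later in the post):

// Minimal sketch (condensed from the fixes above): load the tensor in BGR
// channel order and subtract the per-channel means the model was trained with.
using Microsoft.ML.OnnxRuntime.Tensors;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;

static DenseTensor<float> ToBgrMeanSubtracted(Image<Rgb24> image)
{
   float[] mean = { 102.9801f, 115.9465f, 122.7717f }; // B, G, R channel means

   var tensor = new DenseTensor<float>(new[] { 3, image.Height, image.Width });

   for (int y = 0; y < image.Height; y++)
   {
      for (int x = 0; x < image.Width; x++)
      {
         Rgb24 pixel = image[x, y];

         tensor[0, y, x] = pixel.B - mean[0]; // channel 0 is blue
         tensor[1, y, x] = pixel.G - mean[1]; // channel 1 is green
         tensor[2, y, x] = pixel.R - mean[2]; // channel 2 is red
      }
   }

   return tensor;
}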

The Azure HTTP Trigger function ran but failed because the output tensor names were incorrect.

I used Netron to inspect the model properties to get the correct names of the output tensors.
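Netron worked well for this; the ONNX Runtime API can also list the tensor names at runtime, which is handy in a deployed environment. A minimal sketch (my addition):

// Minimal sketch (my addition): InferenceSession exposes the model's input and
// output metadata, an alternative to inspecting the model in Netron.
using System;
using Microsoft.ML.OnnxRuntime;

using var session = new InferenceSession("FasterRCNN-10.onnx");

foreach (var input in session.InputMetadata)
   Console.WriteLine($"Input: {input.Key} ({input.Value.ElementType})");

foreach (var output in session.OutputMetadata)
   Console.WriteLine($"Output: {output.Key} ({output.Value.ElementType})");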

I had a couple of attempts at resizing the image to see what impact this had on the confidence values and minimum bounding rectangles:

resize the image such that both height and width are within the range of [800, 1333], and then pad the image with zeros such that both height and width are divisible by 32.

modify the code to resize the image such that both height and width are within the range of [800, 1333], and then pad the image with zeros such that both height and width are divisible by 32 and the aspect ratio is not changed.

The final version of the image processing code scaled, then right-padded the image to keep the aspect ratio and minimum bounding rectangle (MBR) coordinates correct.

As a final test I deployed the code to Azure, and the first time I ran the function it failed because the labels file couldn't be found: Unix file paths are case sensitive (labels.txt vs. Labels.txt).
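A way to avoid both the path and casing issues (my suggestion, not generated code) is to resolve the files relative to the application base directory and keep the file name casing exactly as deployed:

// Minimal constructor sketch (my suggestion): resolve the model and labels
// paths relative to the application base directory rather than the working
// directory, and match the deployed file name casing exactly because the
// Linux App Service file system is case sensitive.
string basePath = AppContext.BaseDirectory;

_session = new InferenceSession(Path.Combine(basePath, "FasterRCNN-10.onnx"));
_labels = File.ReadAllLines(Path.Combine(basePath, "labels.txt")).ToList();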

The inferencing time was a bit longer than I expected.
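A Stopwatch around the session Run call is a cheap way to quantify that across plans; a minimal sketch (my addition, using the field names from the listing below):

// Minimal sketch (my addition): time just the ONNX Runtime call so inferencing
// can be compared across App Service plan SKUs in the logs.
var stopwatch = System.Diagnostics.Stopwatch.StartNew();

using IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _session.Run(inputs);

stopwatch.Stop();
_logger.LogInformation("Inferencing took {ElapsedMilliseconds}mSec", stopwatch.ElapsedMilliseconds);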

// please write an httpTrigger azure function that uses Faster RCNN and ONNX to detect the object in an image uploaded in the body of an HTTP Post
//    manually added the ML.Net ONNX NuGet + using directives
//    manually added the ImageSharp NuGet + using directives
//    Used Copilot to add Microsoft.ML.OnnxRuntime.Tensors using directive
//    Manually added ONNX file + labels file, sorted out paths
//    Used Netron to fixup output tensor names
// Change DenseTensor to BGR (based on https://github.com/onnx/models/tree/main/validated/vision/object_detection_segmentation/faster-rcnn#preprocessing-steps)
// Normalise colour values with mean = [102.9801, 115.9465, 122.7717]
// resize the image such that both height and width are within the range of [800, 1333], and then pad the image with zeros such that both height and width are divisible by 32.
// modify the code to resize the image such that both height and width are within the range of [800, 1333], and then pad the image with zeros such that both height and width are divisible by 32 and the aspect ratio is not changed.
using Microsoft.AspNetCore.Http;
using Microsoft.AspNetCore.Mvc;
using Microsoft.Azure.Functions.Worker;
using Microsoft.Extensions.Logging;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using SixLabors.ImageSharp; // Couldn't get IntelliSense after adding NuGet package
using SixLabors.ImageSharp.PixelFormats; // Couldn't get IntelliSense after adding NuGet package
using SixLabors.ImageSharp.Processing; // Couldn't get IntelliSense after adding NuGet package


namespace FasterRCNNObjectDetectionHttpTriggerGithubCopilot
{
   public class Function1
   {
      private readonly ILogger<Function1> _logger;
      private readonly InferenceSession _session;
      private readonly List<string> _labels;

      public Function1(ILogger<Function1> logger)
      {
         _logger = logger;
         _session = new InferenceSession("FasterRCNN-10.onnx");
         _labels = File.ReadAllLines("labels.txt").ToList();
      }

      [Function("ObjectDetectionFunction")]
      public async Task<IActionResult> Run([HttpTrigger(AuthorizationLevel.Function, "post", Route = null)] HttpRequest req)
      {
         if (string.IsNullOrEmpty(req.ContentType) || !req.ContentType.StartsWith("image/"))
            return new BadRequestObjectResult("Content-Type must be an image.");

         using var ms = new MemoryStream();
         await req.Body.CopyToAsync(ms);
         ms.Position = 0;

         using var image = Image.Load<Rgb24>(ms);
         var inputTensor = PreprocessImage(image);

         var inputs = new List<NamedOnnxValue>
         {
            NamedOnnxValue.CreateFromTensor("image", inputTensor)
         };

         using IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = _session.Run(inputs);
         var output = results.ToDictionary(x => x.Name, x => x.Value);

         var boxes = (DenseTensor<float>)output["6379"];
         var labels = (DenseTensor<long>)output["6381"];
         var scores = (DenseTensor<float>)output["6383"];

         var detections = new List<object>();
         for (int i = 0; i < scores.Length; i++)
         {
            if (scores[i] > 0.5)
            {
               detections.Add(new
               {
                  label = _labels[(int)labels[i]],
                  score = scores[i],
                  box = new
                  {
                     x1 = boxes[i, 0],
                     y1 = boxes[i, 1],
                     x2 = boxes[i, 2],
                     y2 = boxes[i, 3]
                  }
               });
            }
         }

         return new OkObjectResult(detections);
      }

      private static DenseTensor<float> PreprocessImage(Image<Rgb24> image)
      {
         // Step 1: Resize so that min(H, W) = 800, max(H, W) <= 1333, keeping aspect ratio
         int origWidth = image.Width;
         int origHeight = image.Height;
         int minSize = 800;
         int maxSize = 1333;

         float scale = Math.Min((float)minSize / Math.Min(origWidth, origHeight),
                                (float)maxSize / Math.Max(origWidth, origHeight));
         /*
         float scale = 1.0f;

         // If either dimension is less than 800, scale up so the smaller is 800
         if (origWidth < minSize || origHeight < minSize)
         {
            scale = Math.Max((float)minSize / origWidth, (float)minSize / origHeight);
         }
         // If either dimension is greater than 1333, scale down so the larger is 1333
         if (origWidth * scale > maxSize || origHeight * scale > maxSize)
         {
            scale = Math.Min((float)maxSize / origWidth, (float)maxSize / origHeight);
         }
         */

         int resizedWidth = (int)Math.Round(origWidth * scale);
         int resizedHeight = (int)Math.Round(origHeight * scale);

         image.Mutate(x => x.Resize(resizedWidth, resizedHeight));

         // Step 2: Pad so that both dimensions are divisible by 32
         int padWidth = ((resizedWidth + 31) / 32) * 32;
         int padHeight = ((resizedHeight + 31) / 32) * 32;

         var paddedImage = new Image<Rgb24>(padWidth, padHeight);
         paddedImage.Mutate(ctx => ctx.DrawImage(image, new Point(0, 0), 1f));

         // Step 3: Convert to BGR and normalize
         float[] mean = { 102.9801f, 115.9465f, 122.7717f };
         var tensor = new DenseTensor<float>(new[] { 3, padHeight, padWidth });

         for (int y = 0; y < padHeight; y++)
         {
            for (int x = 0; x < padWidth; x++)
            {
               Rgb24 pixel = default;
               if (x < resizedWidth && y < resizedHeight)
                  pixel = paddedImage[x, y];

               tensor[0, y, x] = pixel.B - mean[0];
               tensor[1, y, x] = pixel.G - mean[1];
               tensor[2, y, x] = pixel.R - mean[2];
            }
         }

         paddedImage.Dispose();
         return tensor;
      }
   }
}

It took roughly an hour to “vibe code” the function, but it would have taken much longer for someone not familiar with the problem domain.

Summary

The GitHub Copilot generated code was okay but would be fragile, performance would suck, and it would not scale terribly well.

The Copilot generated code in this post is not suitable for production.
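One easy improvement (my suggestion, not generated code) would be loading the model once per worker rather than in the function class constructor; the ONNX Runtime documentation describes InferenceSession as safe for concurrent Run calls, so a shared instance is reasonable. A minimal Program.cs sketch for the isolated worker model:

// Minimal Program.cs sketch (my suggestion): register a single shared
// InferenceSession so the ONNX model file is loaded once per worker rather
// than every time the function class is constructed.
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.ML.OnnxRuntime;

var host = new HostBuilder()
   .ConfigureFunctionsWorkerDefaults()
   .ConfigureServices(services =>
   {
      services.AddSingleton(_ => new InferenceSession("FasterRCNN-10.onnx"));
   })
   .Build();

host.Run();

The function class constructor would then take the InferenceSession as a second parameter instead of creating it.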

ONNXRuntime.AI – Faster R-CNN C# Sample differences

After building Faster R-CNN object detection applications with Copilot and GitHub Copilot, the results were slightly different when compared with the onnxruntime.ai Object detection with Faster RCNN Deep Learning in C# sample (which hasn't been updated for years).

The sample image was 640×480 pixels.

The FasterRCNNObjectDetectionApplicationGitHubCopilot application's scaled image was initially 1056×800, then 1088×800 pixels.

In the initial version the dimensions were “rounded down” to a multiple of 32:

// Calculate scale factor to fit within the range while maintaining aspect ratio
float scale = Math.Min((float)maxSize / Math.Max(originalWidth, originalHeight),
                                (float)minSize / Math.Min(originalWidth, originalHeight));

// Calculate new dimensions
int newWidth = (int)(originalWidth * scale);
int newHeight = (int)(originalHeight * scale);

// Ensure dimensions are divisible by 32
newWidth = (newWidth / divisor) * divisor;
newHeight = (newHeight / divisor) * divisor;

Scaled 1056×800

Then for the second version the dimensions were “rounded up” to the next multiple of 32:

// Calculate scale factor to fit within the range while maintaining aspect ratio
float scale = Math.Min((float)maxSize / Math.Max(originalWidth, originalHeight),
                                (float)minSize / Math.Min(originalWidth, originalHeight));

// Calculate new dimensions
int newWidth = (int)(originalWidth * scale);
int newHeight = (int)(originalHeight * scale);

// Ensure dimensions are divisible by 32
newWidth = (int)(Math.Ceiling(newWidth / 32f) * 32f);
newHeight = (int)(Math.Ceiling(newHeight / 32f) * 32f);

Scaled 1088×800

Marked up 1088×800
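The arithmetic for the two versions (my worked example, not from the post):

// Worked example (my arithmetic): the 640×480 sample image scaled so the short
// side is 800 has a width of 1066 (truncated from 1066.67); rounding that down
// to a multiple of 32 gives 1056, rounding it up gives 1088.
using System;

int originalWidth = 640, originalHeight = 480;

float scale = 800f / Math.Min(originalWidth, originalHeight);     // 1.6667
int scaledWidth = (int)(originalWidth * scale);                   // 1066

int roundedDown = (scaledWidth / 32) * 32;                        // 1056
int roundedUp = (int)(Math.Ceiling(scaledWidth / 32f) * 32f);     // 1088

Console.WriteLine($"scaled {scaledWidth} -> down {roundedDown}, up {roundedUp}");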

The FasterRCNNObjectDetectionApplicationOriginal application scaled the input image to 1066×800.

Scaled image 1066×800

The FasterRCNNObjectDetectionApplicationOriginal application pillar-boxed/padded the image to 1088×800 as the DenseTensor was loaded.

using Image<Rgb24> image = Image.Load<Rgb24>(imageFilePath);

Console.WriteLine($"Before x:{image.Width} y:{image.Height}");

// Resize image
float ratio = 800f / Math.Min(image.Width, image.Height);
image.Mutate(x => x.Resize((int)(ratio * image.Width), (int)(ratio * image.Height)));

Console.WriteLine($"After x:{image.Width} y:{image.Height}");

// Preprocess image
var paddedHeight = (int)(Math.Ceiling(image.Height / 32f) * 32f);
var paddedWidth = (int)(Math.Ceiling(image.Width / 32f) * 32f);

Console.WriteLine($"Padded x:{paddedWidth} y:{paddedHeight}");

Tensor<float> input = new DenseTensor<float>(new[] { 3, paddedHeight, paddedWidth });
var mean = new[] { 102.9801f, 115.9465f, 122.7717f };
image.ProcessPixelRows(accessor =>
{
   for (int y = paddedHeight - accessor.Height; y < accessor.Height; y++)
   {
      Span<Rgb24> pixelSpan = accessor.GetRowSpan(y);
      for (int x = paddedWidth - accessor.Width; x < accessor.Width; x++)
      {
         input[0, y, x] = pixelSpan[x].B - mean[0];
         input[1, y, x] = pixelSpan[x].G - mean[1];
         input[2, y, x] = pixelSpan[x].R - mean[2];
      }
   }
});

Marked up image 1066×800

I think the three different implementations of the preprocessing steps, and the graphics libraries used, probably caused the differences in the results. The way an image is “resized” by System.Drawing.Common vs. ImageSharp (resampled, cropped and centered, or padded and pillar-boxed) could make a significant difference to the results.
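Being explicit about the resampler and resize mode removes one of those sources of variation; a minimal ImageSharp sketch (my suggestion):

// Minimal sketch (my suggestion): pin the ImageSharp resampler and resize mode
// so the resize behaviour is explicit when comparing implementations.
using System;
using SixLabors.ImageSharp;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;

using var image = Image.Load<Rgb24>("sports.jpg"); // hypothetical input path

float ratio = 800f / Math.Min(image.Width, image.Height);

image.Mutate(x => x.Resize(new ResizeOptions
{
   Size = new Size((int)(ratio * image.Width), (int)(ratio * image.Height)),
   Sampler = KnownResamplers.Bicubic, // ImageSharp's default for Resize
   Mode = ResizeMode.Stretch          // resample only, no cropping or padding
}));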

ONNXRuntime.AI – Faster R-CNN C# Sample oddness

After building Faster R-CNN object detection applications with Copilot and GitHub Copilot, the results didn't look too bad when compared with Ultralytics Yolo (with YoloSharp).

The input image sports.jpg was 1200×798 pixels.

The FasterRCNNObjectDetectionApplicationCopilot application only generated labels, confidences, and minimum bounding box coordinates.

For the FasterRCNNObjectDetectionApplicationGitHubCopilot application the marked-up image was 1200×798 pixels.

The YoloSharpObjectDetectionApplication application's marked-up image was 1200×798 pixels.

I went back to the onnxruntime.ai Object detection with Faster RCNN Deep Learning in C# sample source code to check my implementations, and one area of the code caught my attention.

The FasterRCNNObjectDetectionApplicationOriginal application's marked-up image was 1023×800.

I downloaded the sample code, which hadn't been updated for years.

public static void Main(string[] args)
{
   Console.WriteLine("FasterRCNNObjectDetectionApplicationOriginal");

   // Read paths
   string modelFilePath = args[0];
   string imageFilePath = args[1];
   string outImageFilePath = args[2];

   // Read image
   using Image<Rgb24> image = Image.Load<Rgb24>(imageFilePath);

   // Resize image
   float ratio = 800f / Math.Min(image.Width, image.Height);
   image.Mutate(x => x.Resize((int)(ratio * image.Width), (int)(ratio * image.Height)));

   // Preprocess image
   var paddedHeight = (int)(Math.Ceiling(image.Height / 32f) * 32f);
   var paddedWidth = (int)(Math.Ceiling(image.Width / 32f) * 32f);
   Tensor<float> input = new DenseTensor<float>(new[] { 3, paddedHeight, paddedWidth });
   var mean = new[] { 102.9801f, 115.9465f, 122.7717f };
   image.ProcessPixelRows(accessor =>
   {
      for (int y = paddedHeight - accessor.Height; y < accessor.Height; y++)
      {
         Span<Rgb24> pixelSpan = accessor.GetRowSpan(y);
         for (int x = paddedWidth - accessor.Width; x < accessor.Width; x++)
         {
            input[0, y, x] = pixelSpan[x].B - mean[0];
            input[1, y, x] = pixelSpan[x].G - mean[1];
            input[2, y, x] = pixelSpan[x].R - mean[2];
         }
      }
   });

   // Setup inputs and outputs
   var inputs = new List<NamedOnnxValue>
   {
      NamedOnnxValue.CreateFromTensor("image", input)
   };

   // Run inference
   using var session = new InferenceSession(modelFilePath);
   using IDisposableReadOnlyCollection<DisposableNamedOnnxValue> results = session.Run(inputs);

   // Postprocess to get predictions
   var resultsArray = results.ToArray();
   float[] boxes = resultsArray[0].AsEnumerable<float>().ToArray();
   long[] labels = resultsArray[1].AsEnumerable<long>().ToArray();
   float[] confidences = resultsArray[2].AsEnumerable<float>().ToArray();
   var predictions = new List<Prediction>();
   var minConfidence = 0.7f;
   for (int i = 0; i < boxes.Length - 4; i += 4)
   {
      var index = i / 4;
      if (confidences[index] >= minConfidence)
      {
         predictions.Add(new Prediction
         {
            Box = new Box(boxes[i], boxes[i + 1], boxes[i + 2], boxes[i + 3]),
            Label = LabelMap.Labels[labels[index]],
            Confidence = confidences[index]
         });
      }
   }

   // Put boxes, labels and confidence on image and save for viewing
   using var outputImage = File.OpenWrite(outImageFilePath);
   Font font = SystemFonts.CreateFont("Arial", 16);
   foreach (var p in predictions)
   {
      Console.WriteLine($"Label: {p.Label}, Confidence: {p.Confidence}, Bounding Box:[{p.Box.Xmin}, {p.Box.Ymin}, {p.Box.Xmax}, {p.Box.Ymax}]");
      image.Mutate(x =>
      {
         x.DrawLine(Color.Red, 2f, new PointF[] {
            new PointF(p.Box.Xmin, p.Box.Ymin),
            new PointF(p.Box.Xmax, p.Box.Ymin),

            new PointF(p.Box.Xmax, p.Box.Ymin),
            new PointF(p.Box.Xmax, p.Box.Ymax),

            new PointF(p.Box.Xmax, p.Box.Ymax),
            new PointF(p.Box.Xmin, p.Box.Ymax),

            new PointF(p.Box.Xmin, p.Box.Ymax),
            new PointF(p.Box.Xmin, p.Box.Ymin)
         });
         x.DrawText($"{p.Label}, {p.Confidence:0.00}", font, Color.White, new PointF(p.Box.Xmin, p.Box.Ymin));
      });
   }
   image.SaveAsJpeg(outputImage);

   Console.WriteLine("Press Enter to exit");
   Console.ReadLine();
}

I then compared the output of the object detection applications, and the onnxruntime.ai Object detection with Faster RCNN Deep Learning in C# sample's output was different.

After some investigation, I think the scaling of the image used for inferencing (based on the requirements of the model), and then the scaling back of the minimum bounding rectangles, isn't quite right.
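My reading of the fix (a sketch, not the sample's code) is that the minimum bounding rectangles come back in resized-image coordinates, so they need dividing by the same ratio used for the resize before being drawn on the original image:

// Minimal sketch (my reading of the issue): map a box from resized-image
// coordinates back onto the original image by dividing by the resize ratio.
using System;

static (float X1, float Y1, float X2, float Y2) ScaleBoxToOriginal(
   float x1, float y1, float x2, float y2, int originalWidth, int originalHeight)
{
   // The same ratio the preprocessing used to scale the short side to 800 pixels.
   float ratio = 800f / Math.Min(originalWidth, originalHeight);

   return (x1 / ratio, y1 / ratio, x2 / ratio, y2 / ratio);
}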