Skip to content

Add script footwear recognition #192

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions image_recognition_footwear/scripts/get_footwear
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python3

from __future__ import print_function
import argparse
from image_recognition_footwear.model import Model
from image_recognition_footwear.process_data import heroPreprocess, detection_RGB
from PIL import Image
import os
import torch

# Assign description to the help doc
parser = argparse.ArgumentParser(description='Get footwear detected using PyTorch')

# Add arguments
parser.add_argument('image', type=str, help='Image')
parser.add_argument('--weights-path', type=str, help='Path to the weights of the VGG model',
default=os.path.expanduser('~/data/pytorch_models/footwearModel.pth'))

parser.add_argument('--input-channel', type=int, help='Size of the input model channel', default=3)
parser.add_argument('--channel1-size', type=int, help='Size channel 1', default=128)
parser.add_argument('--channel2-size', type=int, help='Size channel 2', default=256)
parser.add_argument('--channel3-size', type=int, help='Size channel 3', default=512)
parser.add_argument('--nodes-fclayer1-size', type=int, help='Size fully connected layer 1 neurons', default=1024)
parser.add_argument('--nodes-fclayer2-size', type=int, help='Size fully connected layer 2 neurons', default=1024)
parser.add_argument('--class-size', type=int, help='Classes of the network', default=2)

device = torch.device('cuda')
dtype = torch.float32

args = parser.parse_args()

# Read the image and preprocess
img = Image.open(args.image)
preprocessed_img = heroPreprocess(img)

# Load the model
model = Model(in_channel=args.input_channel, channel_1=args.channel1_size, channel_2=args.channel2_size, channel_3=args.channel3_size, node_1=args.nodes_fclayer1_size, node_2=args.nodes_fclayer2_size, num_classes=args.class_size)
model.load_state_dict(torch.load(args.weights_path))
model.to(device=device)

# Detection
detector = detection_RGB(preprocessed_img, model)

print(detector)

5 changes: 1 addition & 4 deletions image_recognition_footwear/setup.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from setuptools import setup
from catkin_pkg.python_setup import generate_distutils_setup

# Build the keyword arguments for setup() with catkin_pkg's helper, declaring
# the single Python package of this project, which lives under src/.
d = generate_distutils_setup(packages=["image_recognition_footwear"], package_dir={"": "src"})

setup(**d)
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from . import model
from . import process_data
54 changes: 54 additions & 0 deletions image_recognition_footwear/src/image_recognition_footwear/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import torch
import torch.nn as nn


class Model(nn.Module):
    """Convolutional classifier: three double-convolution stages and a dense head.

    Each stage is Conv-BatchNorm-LeakyReLU applied twice, followed by max
    pooling. The first linear layer takes ``channel_3`` input features, so the
    feature map has to be 1x1 after the third stage (true for 32x32 inputs).

    Args:
        in_channels: number of channels of the input image.
        channel_1: output channels of the first convolutional stage.
        channel_2: output channels of the second convolutional stage.
        channel_3: output channels of the third convolutional stage.
        node_1: width of the first fully connected layer.
        node_2: width of the second fully connected layer.
        num_classes: number of output scores.
    """

    def __init__(self, in_channels, channel_1, channel_2, channel_3, node_1, node_2, num_classes):
        super().__init__()

        # Convolutional stages. Only the pooling window differs: the last
        # stage pools with a 7x7 window instead of 2x2.
        self.conv1 = self._conv_stage(in_channels, channel_1, pool_kernel=2)
        self.conv2 = self._conv_stage(channel_1, channel_2, pool_kernel=2)
        self.conv3 = self._conv_stage(channel_2, channel_3, pool_kernel=7)

        # Affine layers
        head = [
            nn.Flatten(),
            nn.Linear(channel_3, node_1),
            nn.BatchNorm1d(node_1),
            nn.Dropout(p=0.5),
            nn.Linear(node_1, node_2),
            nn.BatchNorm1d(node_2),
            nn.Linear(node_2, num_classes),
        ]
        self.fc = nn.Sequential(*head)

    @staticmethod
    def _conv_stage(n_in, n_out, pool_kernel):
        """Return one (Conv-BatchNorm-LeakyReLU) x 2 + MaxPool stage."""
        return nn.Sequential(
            nn.Conv2d(n_in, n_out, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm2d(n_out),
            nn.LeakyReLU(),
            nn.Conv2d(n_out, n_out, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm2d(n_out),
            nn.LeakyReLU(),
            nn.MaxPool2d(kernel_size=pool_kernel, stride=2),
        )

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        features = self.conv3(self.conv2(self.conv1(x)))
        return self.fc(features)
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from torchvision import transforms as T
import torch
from PIL import Image


def preprocess_rgb(img):
    """Turn a PIL image into a normalized model input.

    The image is resized to 32x32 with nearest-neighbour interpolation,
    converted to a tensor, mapped to a 3-channel grayscale image and
    normalized per channel. A leading batch dimension is added.

    Input is a PIL image.
    Output is a pytorch tensor holding a batch of one image."""
    resized = T.functional.resize(img, size=(32, 32), interpolation=Image.NEAREST)

    to_tensor = T.ToTensor()
    to_gray = T.Grayscale(num_output_channels=3)
    normalize = T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    tensor = normalize(to_gray(to_tensor(resized)))
    return tensor.unsqueeze(0)


def hero_preprocess(img):
    """Crop the footwear region out of a camera image and prepare it for the model.

    The crop is a 100 pixel wide strip directly left of the horizontal centre,
    reaching from 140 pixels below the vertical centre down to the bottom edge
    (for a 640x480 Hero image: left=220, top=380, right=320, bottom=480).
    The crop is resized to 32x32 with nearest-neighbour interpolation,
    converted to a tensor, normalized per channel and given a batch dimension.

    Expected input is a PIL image; images that are not in RGB mode are
    converted first, because the normalization expects exactly three channels.
    Output is a pytorch tensor holding a batch of one image."""
    if img.mode != "RGB":
        # RGBA, grayscale or palette images would otherwise fail in the
        # 3-channel normalization below.
        img = img.convert("RGB")

    width, height = img.size  # Hero image size (640x480)
    left = width / 2 - 100
    top = height / 2 + 140
    right = width / 2
    bottom = height
    cropped = img.crop((left, top, right, bottom))
    resized = T.functional.resize(cropped, size=(32, 32), interpolation=Image.NEAREST)
    trans = T.Compose([T.ToTensor(), T.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])
    img_trans = trans(resized)
    img_trans = img_trans.unsqueeze(0)

    return img_trans


def detection_rgb(img, model):
    """Run the footwear classifier on one preprocessed image.

    Args:
        img: preprocessed image tensor holding a batch of exactly one image,
            for example the output of ``hero_preprocess``.
        model: ``torch.nn.Module`` with at least one parameter. It is switched
            to evaluation mode as a side effect.

    Returns:
        int: index of the highest score along dimension 1 of the model output.
        For the binary footwear model, 1 is expected to mean "footwear
        detected" (confirm against the labels used for training).

    Raises:
        RuntimeError: from ``Tensor.item()`` when the prediction contains more
            than one element, i.e. when more than one image was passed.
    """
    model.eval()
    # The first parameter tells which device and dtype the model lives on,
    # so the input can be moved to match it.
    reference = next(model.parameters())
    with torch.no_grad():
        img = img.to(device=reference.device, dtype=reference.dtype)
        scores = model(img)
        preds = torch.argmax(scores, dim=1)
    # Tensor.item() extracts the single prediction directly; converting a 1-d
    # numpy array with int() is deprecated in recent NumPy releases.
    return int(preds.item())
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions image_recognition_footwear/test/run_tests.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Run nosetests verbosely on the directory that contains this script,
# independent of the directory the script is started from.
test_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
nosetests -vv "${test_dir}"
46 changes: 46 additions & 0 deletions image_recognition_footwear/test/test_footwear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env python

import os
import re
from future.moves.urllib.request import urlretrieve
import unittest

from PIL import Image
import rospkg
from image_recognition_footwear.model import Model
from image_recognition_footwear.process_data import hero_preprocess, detection_rgb
import torch


@unittest.skip("Requires locally available model weights and test assets")
def test_footwear():
    """Check the footwear detector against the images in ``test/assets``.

    The expected result of every image is derived from its file name: the
    word captured in front of ``_shoe`` has to be ``yes`` for an image that
    shows footwear. Every image is preprocessed with ``hero_preprocess`` and
    classified on its own, because ``detection_rgb`` handles one image per call.
    """
    # "~" is not expanded by os.path.exists or torch.load, so expand it here.
    local_path = os.path.expanduser("~/data/pytorch_models/footwearModel.pth")

    if not os.path.exists(local_path):
        # Without the weights nothing can be verified; report a skip instead
        # of failing later inside torch.load.
        raise unittest.SkipTest("File does not exist {}".format(local_path))

    def is_there_footwear_from_asset_name(asset_name):
        """Return True when the asset name marks the image as containing footwear."""
        match = re.search(r"(\w+)_shoe", asset_name)
        if match is None:
            raise ValueError("Unexpected asset name {}".format(asset_name))
        return match.group(1) == "yes"

    assets_path = os.path.join(rospkg.RosPack().get_path("image_recognition_footwear"), "test/assets")

    # Fall back to the CPU when no GPU is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = Model(in_channels=3, channel_1=128, channel_2=256, channel_3=512, node_1=1024, node_2=1024, num_classes=2)
    model.load_state_dict(torch.load(local_path, map_location=device))
    model.to(device=device)

    for asset in os.listdir(assets_path):
        is_footwear_gt = is_there_footwear_from_asset_name(asset)
        with Image.open(os.path.join(assets_path, asset)) as image:
            binary_detection = int(detection_rgb(hero_preprocess(image), model))
        assert is_footwear_gt == binary_detection, f"{binary_detection=}, {is_footwear_gt=}"


if __name__ == "__main__":
    test_footwear()