Collecting Data from the Web with Python
Removing duplicate images
1. Remove invalid images
"""Delete every file under a directory tree that OpenCV cannot decode.

Usage: python remove_invalid.py <input_path>

Walks <input_path> recursively; any file whose extension is in
SUPPORTED_FORMATS but which cv2.imread fails to decode is deleted.
"""
import os
import sys

import cv2

from collect_data import SUPPORTED_FORMATS

input_path = sys.argv[1]
for root, dirs, files in os.walk(input_path):
    for filename in files:
        # splitext keeps the leading dot; strip it before matching.
        ext = os.path.splitext(filename)[1][1:].lower()
        if ext not in SUPPORTED_FORMATS:
            continue
        filepath = os.path.join(root, filename)
        # cv2.imread returns None when the file is corrupt or not an image.
        if cv2.imread(filepath) is None:
            # os.remove instead of `os.system('rm ...')`: portable, and not
            # broken by spaces or shell metacharacters in the filename.
            os.remove(filepath)
            print('{} is not a valid image file. Deleted!'.format(filepath))
2. Remove duplicate files
FDUPES is a program that identifies duplicate files within the specified directories.
sudo apt-get install fdupes
fdupes -rdN ./
3. Remove visually similar or duplicate images
3.1 Down scale
"""Downscale every image under a directory so its short edge is at most a
target size, preserving aspect ratio and overwriting each file in place.

Usage: python downscale.py <input_path> <target_short_edge>
"""
import os
import sys

import cv2

input_path = sys.argv[1].rstrip(os.sep)
target_short_edge = int(sys.argv[2])
for root, dirs, files in os.walk(input_path):
    print('scanning {} ...'.format(root))
    for filename in files:
        filepath = os.sep.join([root, filename])
        img = cv2.imread(filepath)
        if img is None:
            # Skip files OpenCV cannot decode (non-images, corrupt files);
            # the original crashed on img.shape here.
            continue
        h, w = img.shape[:2]
        short_edge = min(w, h)
        # Only shrink; images already at or below the target are untouched.
        if short_edge > target_short_edge:
            scale = float(target_short_edge) / float(short_edge)
            new_w = int(round(w * scale))
            new_h = int(round(h * scale))
            # Fixed: the original passed (w, w) and printed "w x w"
            # instead of the source size "w x h".
            print('Down sampling {} from {} x {} to {} x {} ...'.format(
                filepath, w, h, new_w, new_h
            ))
            # cv2.resize takes the target size as (width, height).
            img = cv2.resize(img, (new_w, new_h))
            cv2.imwrite(filepath, img)
print('Done!')
3.2 Find image dupes
sudo apt-get install findimagedupes
findimagedupes -R train > dup_list
3.3 Remove duplicates
"""Delete duplicate images listed by findimagedupes.

Each line of the input file holds one group of visually identical files;
the first path on a line is kept and the rest are deleted.

Usage: python remove_dups.py <dup_list>

NOTE(review): paths are split on whitespace, so filenames containing
spaces are mis-parsed -- same limitation as findimagedupes' own output.
"""
import os
import sys

dup_list = sys.argv[1]
with open(dup_list, 'r') as f:
    # Iterate the file lazily instead of readlines(); split() also strips
    # the trailing newline, so no extra cleanup is needed.
    for line in f:
        dups = line.split()
        if not dups:
            continue  # guard: a blank line would raise IndexError below
        print('Removing duplicates of {}'.format(dups[0]))
        for dup in dups[1:]:
            # os.remove instead of `os.system('rm ...')`: portable, and not
            # vulnerable to shell metacharacters in filenames.
            os.remove(dup)