Collecting Data from the Web with Python
Removing duplicate images
1. Remove invalid images
"""Delete every file under a directory tree that OpenCV cannot decode.

Usage: python remove_invalid.py <input_path>

Walks <input_path> recursively; any file whose extension is in
SUPPORTED_FORMATS but which cv2.imread fails to decode is deleted.
"""
import os
import sys

import cv2

from collect_data import SUPPORTED_FORMATS

input_path = sys.argv[1]
for root, dirs, files in os.walk(input_path):
    for filename in files:
        # splitext keeps the leading dot; strip it before matching.
        ext = os.path.splitext(filename)[1][1:].lower()
        if ext not in SUPPORTED_FORMATS:
            continue
        filepath = os.path.join(root, filename)
        # cv2.imread returns None when the file is corrupt or not an image.
        if cv2.imread(filepath) is None:
            # os.remove instead of `os.system('rm ...')`: portable, and not
            # broken by spaces or shell metacharacters in the filename.
            os.remove(filepath)
            print('{} is not a valid image file. Deleted!'.format(filepath))
2. Remove duplicate files
FDUPES is a program that identifies duplicate files within the specified directories.
sudo apt-get install fdupes
fdupes -rdN ./
3. Remove visually similar or duplicate images
3.1 Down scale
"""Downscale every image under a directory so its short edge is at most a
target size, preserving aspect ratio and overwriting each file in place.

Usage: python downscale.py <input_path> <target_short_edge>
"""
import os
import sys

import cv2

input_path = sys.argv[1].rstrip(os.sep)
target_short_edge = int(sys.argv[2])
for root, dirs, files in os.walk(input_path):
    print('scanning {} ...'.format(root))
    for filename in files:
        filepath = os.sep.join([root, filename])
        img = cv2.imread(filepath)
        if img is None:
            # Skip files OpenCV cannot decode (non-images, corrupt files);
            # the original crashed on img.shape here.
            continue
        h, w = img.shape[:2]
        short_edge = min(w, h)
        # Only shrink; images already at or below the target are untouched.
        if short_edge > target_short_edge:
            scale = float(target_short_edge) / float(short_edge)
            new_w = int(round(w * scale))
            new_h = int(round(h * scale))
            # Fixed: the original passed (w, w) and printed "w x w"
            # instead of the source size "w x h".
            print('Down sampling {} from {} x {} to {} x {} ...'.format(
                filepath, w, h, new_w, new_h
            ))
            # cv2.resize takes the target size as (width, height).
            img = cv2.resize(img, (new_w, new_h))
            cv2.imwrite(filepath, img)
print('Done!')
3.2 Find image dupes
sudo apt-get install findimagedupes
findimagedupes -R train > dup_list
3.3 Remove duplicates
"""Delete duplicate images listed by findimagedupes.

Each line of the input file holds one group of visually identical files;
the first path on a line is kept and the rest are deleted.

Usage: python remove_dups.py <dup_list>

NOTE(review): paths are split on whitespace, so filenames containing
spaces are mis-parsed -- same limitation as findimagedupes' own output.
"""
import os
import sys

dup_list = sys.argv[1]
with open(dup_list, 'r') as f:
    # Iterate the file lazily instead of readlines(); split() also strips
    # the trailing newline, so no extra cleanup is needed.
    for line in f:
        dups = line.split()
        if not dups:
            continue  # guard: a blank line would raise IndexError below
        print('Removing duplicates of {}'.format(dups[0]))
        for dup in dups[1:]:
            # os.remove instead of `os.system('rm ...')`: portable, and not
            # vulnerable to shell metacharacters in filenames.
            os.remove(dup)