torchsight.evaluators.flickr32.flickrlogos.core_io module

Convenient I/O functions.

Author: Stefan Romberg, stefan.romberg@informatik.uni-augsburg.de
Note: Script was developed/tested on Windows with Python 2.7
$Date: 2013-11-18 13:07:55 +0100 (Mo, 18 Nov 2013) $ $Rev: 7627 $$Date: 2013-11-18 13:07:55 +0100 (Mo, 18 Nov 2013) $ $HeadURL: https://137.250.173.47:8443/svn/romberg/trunk/romberg/research/FlickrLogos-32_SDK/FlickrLogos-32_SDK-1.0.4/scripts/flickrlogos/core_io.py $ $Id: core_io.py 7627 2013-11-18 12:07:55Z romberg $
Source code
# -*- coding: utf-8 -*-
"""
Convenient I/O functions.
Author: Stefan Romberg, stefan.romberg@informatik.uni-augsburg.de
Note: Script was developed/tested on Windows with Python 2.7
$Date: 2013-11-18 13:07:55 +0100 (Mo, 18 Nov 2013) $
$Rev: 7627 $$Date: 2013-11-18 13:07:55 +0100 (Mo, 18 Nov 2013) $
$HeadURL: https://137.250.173.47:8443/svn/romberg/trunk/romberg/research/FlickrLogos-32_SDK/FlickrLogos-32_SDK-1.0.4/scripts/flickrlogos/core_io.py $
$Id: core_io.py 7627 2013-11-18 12:07:55Z romberg $
"""
import sys, random
import re
import string
import zlib
from collections import defaultdict
if sys.version_info >= (3,0,0):
from pickle import dump as pickle_dump
from pickle import load as pickle_load
else:
from cPickle import dump as pickle_dump
from cPickle import load as pickle_load
import os
from os.path import exists, basename, dirname, join, isdir, normpath, abspath, split, sep
from os import makedirs, listdir
#===============================================================================
# helper classes
#===============================================================================
class Tee(object):
    """Duplicates everything written to stdout into a file, like unix 'tee'.

    Instantiating a Tee replaces ``sys.stdout`` with the Tee object itself;
    deleting the Tee restores the previous stdout and closes the file.
    Passing None or "-" as *name* disables the file mirror (output then only
    goes to the original stdout).
    """
    def __init__(self, name, mode="w"):
        self.file = None
        if name is not None and name != "-":
            # Typo fix: message used to read "Mirring".
            print("Tee: Mirroring stdout to file '"+name+"'")
            self.file = open(name, mode)
        self.stdout = sys.stdout
        sys.stdout = self
    def __del__(self):
        # Release the file handle and restore the original stdout.
        if self.file is not None:
            self.file.close()
        sys.stdout = self.stdout
    def write(self, data):
        if self.file is not None:
            self.file.write(data)
            #self.file.flush()
        self.stdout.write(data)
    def flush(self):
        # Added for file-like API completeness: code that calls
        # sys.stdout.flush() would otherwise crash on a Tee instance.
        if self.file is not None:
            self.file.flush()
        self.stdout.flush()
#===============================================================================
# helper methods
#===============================================================================
def filename(x):
    """Return the final path component of *x* (name + extension, no directory)."""
    _, tail = split(x)
    return tail
def icount(it):
    """Return the number of items produced by the iterable *it*.

    Note: It consumes the iterator.

    Bug fix: the original enumerate-loop left its counter unbound for an
    empty iterator and raised UnboundLocalError; this version returns 0.
    """
    return sum(1 for _ in it)
def msplit(s, delimiters=";,\t", strip=True, remove_empty_tokens=False, strip_linebreaks=True):
    """Splits the given string by any of the given delimiters.

    More sophisticated version of string.split() aka "multisplit".

    Usage examples:

    >>> msplit("abcd")
    ['abcd']
    >>> msplit("a,b,c,d")
    ['a', 'b', 'c', 'd']
    >>> msplit("a\\tb,c,d")
    ['a', 'b', 'c', 'd']
    >>> msplit("a\\tb,c;d e")
    ['a', 'b', 'c', 'd e']

    The parameter delimiters denotes *all* delimiters that are used to split
    the string into separate tokens. Delimiters *must be* single characters.
    Note: By default msplit() does not split the string at spaces.

    >>> msplit("a\\tb,c;d e", delimiters=";,\\t ")
    ['a', 'b', 'c', 'd', 'e']
    >>> msplit("a\\tb,c;d e", delimiters=";")
    ['a\\tb,c', 'd e']

    If strip is True (default) then split tokens will further be stripped
    of leading and trailing whitespaces.

    Examples:

    >>> msplit(" a, b , c ", strip=True)
    ['a', 'b', 'c']
    >>> msplit(" a, b , c ", strip=False)
    [' a', ' b ', ' c ']

    Note that if argument delimiter contains " " (space) as delimiter argument
    strip has no effect when set to True. Whitespaces are stripped from tokens
    *after* the original string has been split at delimiters. That means:

    >>> msplit("a b c ", delimiters=" ", strip=True)
    ['a', 'b', 'c', '']

    If strip_linebreaks is True (default) then line breaks will be removed
    before the string is split into tokens. This avoids trailing empty tokens:

    Examples:

    Note: strip=True swallows line breaks and so the last token will be empty:

    >>> msplit("a b c d \\n", delimiters=" ", strip=True, strip_linebreaks=True)
    ['a', 'b', 'c', 'd', '']
    >>> msplit("a b c d \\n", delimiters=" ", strip=True, strip_linebreaks=False)
    ['a', 'b', 'c', 'd', '']

    Note: strip=False will preserve the trailing line break as extra token:

    >>> msplit("a b c d \\n", delimiters=" ", strip=False, strip_linebreaks=True)
    ['a', 'b', 'c', 'd', '']
    >>> msplit("a b c d \\n", delimiters=" ", strip=False, strip_linebreaks=False)
    ['a', 'b', 'c', 'd', '\\n']

    If remove_empty_tokens is set to True then empty tokens are removed before
    the list of tokens is returned. By default remove_empty_tokens is False.

    Examples:

    >>> msplit("a,,b", remove_empty_tokens=True)
    ['a', 'b']
    >>> msplit("a,,b,", remove_empty_tokens=True)
    ['a', 'b']
    >>> msplit("", remove_empty_tokens=True)
    []
    >>> msplit(",", remove_empty_tokens=True)
    []
    >>> msplit(",,,", remove_empty_tokens=True)
    []
    >>> msplit("a,,b", remove_empty_tokens=False)
    ['a', '', 'b']
    >>> msplit("a,,b,", remove_empty_tokens=False)
    ['a', '', 'b', '']
    >>> msplit("", remove_empty_tokens=False)
    []
    >>> msplit(",", remove_empty_tokens=False)
    ['', '']

    Degenerated cases:

    >>> msplit("")
    []
    >>> msplit(",,,", remove_empty_tokens=False, strip_linebreaks=True)
    ['', '', '', '']
    >>> msplit(",,,", remove_empty_tokens=False, strip_linebreaks=False)
    ['', '', '', '']
    >>> msplit(",,,", remove_empty_tokens=True)
    []
    >>> msplit(None)
    Traceback (most recent call last):
        ...
    TypeError: msplit() expects string as first argument.
    >>> msplit(5)
    Traceback (most recent call last):
        ...
    TypeError: msplit() expects string as first argument.

    Author: Stefan Romberg, stefan.romberg@informatik.uni-augsburg.de
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    if not isinstance(s, str):
        # None also lands here, matching the documented TypeError.
        # (The original additionally tested "s is None" *after* this guard,
        # which was unreachable dead code and has been removed.)
        raise TypeError("msplit() expects string as first argument.")
    if len(s) == 0:
        return []
    if strip_linebreaks:
        if s.endswith("\r\n"):
            s = s[:-2]
        elif s.endswith(("\n", "\r")):
            s = s[:-1]
    # Map every delimiter onto the first one in a single C-level pass,
    # then split once (replaces the original chain of str.replace calls).
    primary = delimiters[0]
    translation = str.maketrans({d: primary for d in delimiters[1:]})
    tokens = s.translate(translation).split(primary)
    if strip:
        tokens = [ t.strip() for t in tokens ]
    if remove_empty_tokens:
        tokens = [ t for t in tokens if len(t) > 0 ]
    return tokens
#===============================================================================
#
#===============================================================================
def csv_read(filename, delimiters=",\t;", grep=None, strip_tokens=True):
    """Reads a CSV file and returns a list of lists holding each data value.

    * If *filename* is None or empty an exception is raised.
    * If *filename* does not exist None is returned.
    * If *filename* ends with ".zz" csv_read() tries to automatically read
      and decompress the file content with zlib.
    * If *delimiters* is None, raw stripped lines are returned instead of
      token lists (NOTE(review): for plain files these are bytes, since the
      file is opened in binary mode — confirm callers expect that).
    * If *grep* is given, only rows containing an element equal to *grep*
      are kept.

    .. Seealso::
        :func:`csv_write`
        :func:`csv_iread`
    """
    if filename is None or filename == "":
        raise Exception("csv_read(): Cannot handle filename which is empty or None.")
    if not exists(filename):
        sys.stderr.write("csv_read(): File '"+filename+"' does not exist!")
        return None
    #TODO: exclude_columns
    #TODO: exclude_rows
    #TODO: convert numbers
    if filename.endswith(".zz"):
        # assume file content was compressed with zlib (e.g. by csv_write)
        lines = read_compressed_file(filename).splitlines()
        if delimiters is not None:
            data = [ msplit(line, delimiters, strip=strip_tokens) for line in lines ]
            if grep is not None:
                data = [ row for row in data if row.count(grep) > 0 ]
            return data
        if grep is not None:
            lines = [ line for line in lines if line.count(grep) > 0 ]
        return lines
    with open(filename, "rb") as f:
        if delimiters is not None:
            if isinstance(delimiters, list):
                data = [ msplit(line, delimiters, strip=strip_tokens) for line in f ]
            elif isinstance(delimiters, str):
                if strip_tokens:
                    data = [ msplit(line, delimiters, strip=strip_tokens) for line in f ]
                else:
                    # BUG FIX: the file is opened in binary mode, so each line
                    # is bytes; bytes.split() rejects a str separator under
                    # Python 3. Decode first, then split on the *entire*
                    # delimiter string (this branch never split per-character).
                    data = [ line.decode('utf-8').split(delimiters) for line in f ]
            else:
                raise Exception("Cannot handle delimiter type: "+str(delimiters))
        else:
            # Raw mode: return stripped byte lines.
            data = [ line.strip() for line in f ]
        if grep is not None:
            data = [ row for row in data if row.count(grep) > 0 ]
        return data
def _csv_line(item, delimiter):
    """Render one CSV row: strings pass through unchanged, any other iterable
    is joined with *delimiter*; a trailing newline is always appended."""
    if isinstance(item, str):
        return item + '\n'
    return delimiter.join(str(x) for x in item) + '\n'

def csv_write(list_of_list, filename, delimiter=",", compression=None, create_dirs=False):
    """Writes a list of lists to a CSV file.

    Example:

    >>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ]
    >>> csv_write(data_out, "testlist.txt")
    >>> os.path.exists("testlist.txt")
    True
    >>> data_in = csv_read("testlist.txt")
    >>> data_in == data_out
    True

    csv_write() can also write compressed CSV files. To enable compression
    set *compression* to a number in [0, 9]. The file will not be saved as
    "filename" but as "filename.zz".

    >>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ]
    >>> csv_write(data_out, "testlist2.txt", compression=9)
    >>> os.path.exists("testlist2.txt")
    False
    >>> os.path.exists("testlist2.txt.zz")
    True
    >>> data_in = csv_read("testlist2.txt.zz")
    >>> data_in == data_out
    True

    If *compression* is enabled then the list is converted
    to a CSV string and compressed to a file. During this operation the
    whole CSV table is kept as string in memory.

    .. Seealso::
        :func:`csv_read`
    """
    if create_dirs:
        outdir = dirname(filename)
        if not exists(outdir):
            makedirs(outdir)
    if compression is None:
        with open(filename, "wb") as f:
            for item in list_of_list:
                # Row formatting is shared with the compressed path via _csv_line.
                f.write( _csv_line(item, delimiter).encode('utf-8') )
    else:
        if not isinstance(compression, int):
            compression = 9
        payload = ''.join( _csv_line(item, delimiter) for item in list_of_list )
        if not filename.endswith(".zz"):
            filename = filename + ".zz"
        write_compressed_file(filename, payload, compression_level=compression)
#===============================================================================
# file compression
#===============================================================================
def write_compressed_file(filename, content, compression_level=9):
    """Compress the string *content* with zlib and write it to *filename*.

    >>> data_in = "Das ist \\nein Test\\n."
    >>> write_compressed_file("testfile2", data_in)
    >>> data_out = read_compressed_file("testfile2")
    >>> data_in == data_out
    True

    .. Seealso::
        :func:`read_compressed_file`
        :func:`csv_write`
    """
    assert isinstance(content, str), "Can only write strings."
    payload = zlib.compress(content.encode('utf-8'), compression_level)
    with open(filename, "wb") as out:
        out.write(payload)
def read_compressed_file(filename):
    """Read *filename*, zlib-decompress its content and return it as str.

    Returns None when *filename* is missing or is not a regular file.

    .. Seealso::
        :func:`write_compressed_file`
        :func:`csv_read`
    """
    if not exists(filename) or not os.path.isfile(filename):
        return None
    with open(filename, "rb") as src:
        raw = src.read()
    return zlib.decompress(raw).decode('utf-8')
#===============================================================================
# convenient wrappers for simple serialization of python objects
#===============================================================================
def savedump(data, filename, silent=False):
    """Serializes *data* to *filename* using pickle (protocol 2).

    Bug fix: the progress print used to end with a Python-2 style trailing
    comma, which under Python 3 merely built and discarded a tuple.

    .. Seealso::
        :func:`loaddump`
    """
    assert filename is not None and filename != "" and filename != "-", filename
    if not silent:
        print("savedump(): Saving data to '"+filename+"'...")
    with open(filename, "wb") as f:
        # Protocol 2 keeps the dump readable from Python 2 as well.
        pickle_dump(data, f, protocol=2)
    if not silent:
        print("Done")
def loaddump(filename, silent=False):
    """Unserializes and returns pickled data from *filename*.

    Returns None (optionally printing a notice) when the file does not exist.
    Exceptions raised while unpickling are logged and re-raised.

    Bug fixes: removed a Python-2 style trailing comma after the progress
    print, and re-raise with bare ``raise`` to preserve the traceback.

    .. Seealso::
        :func:`savedump`
    """
    assert filename is not None and filename != "" and filename != "-", filename
    if not exists(filename):
        if not silent:
            print("loaddump(): File '"+filename+"' does not exist. Returning None.")
        return None
    try:
        if not silent:
            print("loaddump(): Loading data from '"+filename+"'...")
        with open(filename, "rb") as f:
            data = pickle_load(f)
        if not silent:
            print("Done")
        return data
    except Exception as ex:
        print("ERROR: loaddump(): Could not load data from '"+filename+"'.")
        print(" Passing exception to caller.")
        print(str(ex))
        raise
#===============================================================================
#
#===============================================================================
def exclude_dot_files(filelist):
    """Return *filelist* without entries whose names begin with a dot."""
    visible = []
    for name in filelist:
        if name.startswith('.'):
            continue
        visible.append(name)
    return visible
def is_image_file(filename):
    """Determines if filename indicates that the file is an image.

    Returns true if the filename ends with (case insensitive) one of the
    extensions '.jpg','.jpeg','.pgm','.png','.tif','.tiff','.gif','.bmp' and
    it does not start with '.'.
    (This hides the ._* files produced by Mac OS X aka "Apple Double files")
    """
    # Assume no image file starts with '.': hides ._* AppleDouble files.
    if filename is None or filename == "" or filename.startswith('.'):
        return False
    lowered = filename.lower()
    # endswith accepts a tuple of suffixes — one call instead of an or-chain.
    return lowered.endswith((".jpg", ".jpeg", ".pgm", ".png",
                             ".tiff", ".tif", ".gif", ".bmp"))
def clean_string(s):
    """Lower-case *s* after removing all ASCII punctuation and digits."""
    unwanted = '[%s]' % (string.punctuation + string.digits)
    stripped = re.sub(unwanted, '', s)
    return stripped.lower()
def get_all_image_files(dir_path):
    """Return sorted paths of all image files directly inside *dir_path*."""
    assert dir_path is not None
    entries = sorted(exclude_dot_files(listdir(dir_path)))
    return [ dir_path + sep + name for name in entries if is_image_file(name) ]
def get_classes_by_filenames(image_directory):
    """Group the image files in *image_directory* by the cleaned stem of
    their file names.

    Assumes the class label is the file name minus its last four characters
    (a dot plus a 3-letter extension), run through clean_string().
    """
    assert image_directory is not None
    entries = sorted(exclude_dot_files(listdir(image_directory)))
    classes = dict()
    previous = ""
    for entry in entries:
        if not is_image_file(entry):
            continue
        label = clean_string(basename(entry)[0:-4])
        # Consecutive files with the same label were already collected.
        if label == previous:
            continue
        previous = label
        print("class: "+label)
        members = [ e for e in entries if clean_string(basename(e)[0:-4]) == label ]
        classes[label] = members
    return classes
def get_classes_by_dirnames(base_directory):
    """Returns all image files below base_directory and their associated class.

    <base_directory>/class1/1.jpg
    <base_directory>/class1/2.jpg
    <base_directory>/class2/3.jpg
    <base_directory>/class2/4.jpg

    will yield a dictionary:
        "class1" -> [1.jpg, 2.jpg]
        "class2" -> [3.jpg, 4.jpg]
    """
    assert base_directory is not None
    classes = dict()
    for entry in sorted(exclude_dot_files(listdir(base_directory))):
        subdir = base_directory + sep + entry
        if not isdir(subdir):
            print("Warning: item '"+str(subdir)+"' is not a directory. Skipping...")
            continue
        print("class: "+entry)
        classes[entry] = get_all_image_files(subdir)
    return classes
def wildcards2regex(wildcard_pattern):
    """Converts a wildcard pattern such as "*.txt" to a regular expression.

    '*' becomes '.*', '?' becomes '.', every other regex metacharacter that
    may appear in a path is backslash-escaped, and the result is anchored
    with '^...$' so it must match the whole string.
    """
    p = wildcard_pattern
    # Escape backslashes first so the escapes added below are not re-escaped.
    p = p.replace("\\", "\\\\")
    for ch in ".^$+-=,()[]{}":
        p = p.replace(ch, "\\" + ch)
    # BUG FIX: the original mapped "/" to "\/}" which injected a stray '}'
    # into every pattern containing a slash.
    p = p.replace("/", "\\/")
    p = p.replace("*", ".*")
    p = p.replace("?", ".")
    return "^" + p + "$"
def grab_files(directory, wildcard_pattern=None, regex_pattern=None,
               topdown=True, followlinks=False):
    """Recursively collect files under *directory* whose full (normalized)
    path matches the given pattern. Exactly one of *wildcard_pattern* /
    *regex_pattern* must be supplied.

    If *followlinks* is set to True, symlinks are followed when traversing
    the directory. If *topdown* is set to False, subdirectories are traversed
    bottom-up.

    @author: Stefan Romberg
    """
    assert directory is not None and directory != ""
    # Exactly one of the two pattern arguments must be given (XOR).
    assert (wildcard_pattern is None) != (regex_pattern is None)

    if wildcard_pattern is not None:
        regex_pattern = wildcards2regex(normpath(wildcard_pattern))
    search = re.compile(regex_pattern).search

    hits = []
    walker = os.walk(directory, topdown=topdown, followlinks=followlinks)
    for root, _dirs, names in walker:
        for name in names:
            path = normpath(join(root, name))
            match = search(path)
            if match is None:
                continue
            # The pattern is expected to be anchored and cover the whole path.
            assert match.start() == 0 and match.end() == len(path), \
                (match.start(), match.end(), path)
            hits.append(path)
    return hits
def grab_files2(directory, suffix, topdown=True, followlinks=False):
    """Recursively collect files under *directory* whose names end
    with *suffix*.

    If *followlinks* is set to True, symlinks are followed when traversing
    the directory. If *topdown* is set to False, subdirectories are traversed
    bottom-up.

    @author: Stefan Romberg
    """
    assert directory is not None and directory != ""
    hits = []
    for root, _dirs, names in os.walk(directory, topdown=topdown,
                                      followlinks=followlinks):
        print("grab_files2(): Processing dir: "+root)
        hits.extend(join(root, name) for name in names if name.endswith(suffix))
    return hits
#TODO: move method somewhere appropriate as it deals with a specific directory layout
def get_files_per_class_by_pattern(dir_path, classes, pattern):
    """For each class name in *classes*, collect the files matching the
    wildcard *pattern* below <dir_path>/<class>/."""
    per_class = defaultdict(set)
    for cls in classes:
        class_dir = abspath(dir_path) + '/' + cls + '/'
        matches = grab_files(normpath(class_dir), pattern)
        per_class[cls] = matches
        for path in matches:
            # Sanity check: grab_files must only return existing files.
            assert exists(path), path
    return per_class
if __name__ == '__main__':
    #===========================================================================
    # TESTS
    #===========================================================================
    import doctest
    # BUG FIX: the success message used to be printed unconditionally,
    # even when doctests had failed.
    failed, _total = doctest.testmod()
    if failed == 0:
        print("All doctests passed.")
Functions
def clean_string(s)
-
Source code
def clean_string(s): return re.sub('[%s]' % (string.punctuation+string.digits), '', s).lower()
def csv_read(filename, delimiters=',\t;', grep=None, strip_tokens=True)
-
Reads a CSV file and returns a list of list holding each data value.
- If filename is None or empty an exception is raised.
- If filename does not exist None is returned.
- If filename ends with ".zz" csv_read() tries to automatically read and decompress the file content with zlib.
Seealso: :func:`csv_write`, :func:`csv_iread`
Source code
def csv_read(filename, delimiters=",\t;", grep=None, strip_tokens=True): """Reads a CSV file and returns a list of list holding each data value. * If *filename* is None or empty an exception is raised. * If *filename* does not exist None is returned. * If *filename* ends with ".zz" csv_read() tries to automatically read and decompress the file content with zlib. .. Seealso:: :func:`csv_write` :func:`csv_iread` """ if filename is None or filename == "": raise Exception("csv_read(): Cannot handle filename which is empty or None.") if not exists(filename): sys.stderr.write("csv_read(): File '"+filename+"' does not exist!") return None #TODO: exclude_columns #TODO: exclude_rows #TODO: convert numbers if filename.endswith(".zz"): # assume file content was compressed with zlib. (e.g. by csv_write) text = read_compressed_file(filename) lines = text.splitlines() del text if delimiters is not None: data = [] for line in lines: tokens = msplit(line, delimiters, strip=strip_tokens) data.append( tokens ) if grep is not None: data = [ x for x in data if x.count(grep) > 0 ] return data else: if grep is not None: lines = [ x for x in lines if x.count(grep) > 0 ] return lines else: with open(filename, "rb") as f: data = [] if delimiters is not None: if isinstance(delimiters, list): for line in f: tokens = msplit(line, delimiters, strip=strip_tokens) data.append( tokens ) elif isinstance(delimiters, str): if strip_tokens: for line in f: tokens = msplit(line, delimiters, strip=strip_tokens) data.append( tokens ) else: for line in f: data.append( line.split(delimiters) ) else: raise Exception("Cannot handle delimiter type: "+str(delimiters)) else: data = [ line.strip() for line in f ] if grep is not None: data = [ x for x in data if x.count(grep) > 0 ] return data
def csv_write(list_of_list, filename, delimiter=',', compression=None, create_dirs=False)
-
Writes a list of lists to a CSV file.
Example:
>>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ] >>> csv_write(data_out, "testlist.txt") >>> os.path.exists("testlist.txt") True >>> data_in = csv_read("testlist.txt") >>> data_in == data_out True
csv_write() can also write compressed CSV files. To enable compression set compression to a number in [0, 9]. The file will not be saved as "filename" but as "filename.zz".
>>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ] >>> csv_write(data_out, "testlist2.txt", compression=9) >>> os.path.exists("testlist2.txt") False >>> os.path.exists("testlist2.txt.zz") True >>> data_in = csv_read("testlist2.txt.zz") >>> data_in == data_out True
If compression is enabled then the list is converted to a CSV string and compressed to a file. During this operation the whole CSV table is kept as string in memory.
Seealso: :func:`csv_read`
Source code
def csv_write(list_of_list, filename, delimiter=",", compression=None, create_dirs=False): """Writes a list of lists to a CSV file. Example: >>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ] >>> csv_write(data_out, "testlist.txt") >>> os.path.exists("testlist.txt") True >>> data_in = csv_read("testlist.txt") >>> data_in == data_out True csv_write() can also write compressed CSV files. To enable compression set *compression* to a number in [0, 9]. The file will not be saved as "filename" but as "filename.zz". >>> data_out = [ ["column1", '1', '11'], ["Test", '2', '22'] ] >>> csv_write(data_out, "testlist2.txt", compression=9) >>> os.path.exists("testlist2.txt") False >>> os.path.exists("testlist2.txt.zz") True >>> data_in = csv_read("testlist2.txt.zz") >>> data_in == data_out True If *compression* is enabled then the list is converted to a CSV string and compressed to a file. During this operation the whole CSV table is kept as string in memory. .. Seealso:: :func:`csv_read` """ if create_dirs: outdir = dirname(filename) if not exists(outdir): makedirs(outdir) if compression is None: with open(filename, "wb") as f: for item in list_of_list: if isinstance(item, str): line = item + '\n' else: line = delimiter.join([ str(x) for x in item ]) + '\n' f.write( line.encode('utf-8') ) else: if not isinstance(compression, int): compression = 9 lines = [] for item in list_of_list: if isinstance(item, str): line = item + '\n' else: line = delimiter.join([ str(x) for x in item ]) + '\n' lines.append( line ) if not filename.endswith(".zz"): filename = filename + ".zz" write_compressed_file(filename, ''.join(lines), compression_level=compression)
def exclude_dot_files(filelist)
-
Removes all files starting with '.' from a filelist.
Source code
def exclude_dot_files(filelist): """Removes all files starting with '.' from a filelist.""" return [ x for x in filelist if not x.startswith('.') ]
def filename(x)
-
Returns the filename without the directory part including extension.
Source code
def filename(x): """Returns the filename without the directory part including extension.""" return split(x)[1]
def get_all_image_files(dir_path)
-
Returns a list of all image files within dir_path.
Source code
def get_all_image_files(dir_path): """Returns a list of all image files within *dir_path*.""" assert dir_path is not None files = listdir(dir_path) images = sorted(exclude_dot_files(files)) imgFiles = [ ''.join( (dir_path, sep, x) ) for x in images if is_image_file(x) ] return imgFiles
def get_classes_by_dirnames(base_directory)
-
Returns all image files below base_directory and their associated class.
/class1/1.jpg /class1/2.jpg /class2/3.jpg /class2/4.jpg will yield a dictionary: "class1" -> [1.jpg, 2.jpg] "class2" -> [3.jpg, 4.jpg]
Source code
def get_classes_by_dirnames(base_directory): """Returns all image files below base_directory and their associated class. <base_directory>/class1/1.jpg <base_directory>/class1/2.jpg <base_directory>/class2/3.jpg <base_directory>/class2/4.jpg will yield a dictionary: "class1" -> [1.jpg, 2.jpg] "class2" -> [3.jpg, 4.jpg] """ assert base_directory is not None directories = sorted(exclude_dot_files(listdir(base_directory))) classes = dict() for item in directories: dir_path = base_directory + sep + item if not isdir(dir_path): print("Warning: item '"+str(dir_path)+"' is not a directory. Skipping...") continue category = item print("class: "+category) imgFiles = get_all_image_files(dir_path) classes[category] = imgFiles return classes
def get_classes_by_filenames(image_directory)
-
Source code
def get_classes_by_filenames(image_directory): assert image_directory is not None oldcategory = "" images = sorted(exclude_dot_files(listdir(image_directory))) classes = dict() for img in images: if not is_image_file(img): continue category = clean_string(basename(img)[0:-4]) if category == oldcategory: continue oldcategory = category print("class: "+category) imgFiles = [ item for item in images if clean_string(basename(item)[0:-4]) == category ] classes[category] = imgFiles return classes
def get_files_per_class_by_pattern(dir_path, classes, pattern)
-
Source code
def get_files_per_class_by_pattern(dir_path, classes, pattern): x = defaultdict(set) for c in classes: subdir = abspath(dir_path) + '/' + c + '/' files = grab_files(normpath(subdir), pattern) x[c] = files for f in files: assert exists(f), f return x
def grab_files(directory, wildcard_pattern=None, regex_pattern=None, topdown=True, followlinks=False)
-
Returns all files within directory AND its subdirectories that match the given pattern. Either wildcards or a regular expression can be used as pattern.
If followlinks is set to True, symlinks are followed when traversing the directory. If topdown is set to False, subdirectories are traversed bottom-up.
@author: Stefan Romberg
Source code
def grab_files(directory, wildcard_pattern=None, regex_pattern=None, topdown=True, followlinks=False): """Returns all files within directory AND its subdirectories that match the given pattern. Either wildcards or a regular expression can be used as pattern. If *followlinks* is set to True, symlinks are followed when traversing the directory. If *topdown* is set to False, subdirectories are traversed bottom-up. @author: Stefan Romberg """ assert directory is not None and directory != "" assert ( ( wildcard_pattern is not None and regex_pattern is None ) or ( wildcard_pattern is None and regex_pattern is not None ) ) # prepare regex engine if wildcard_pattern is not None: regex_pattern = wildcards2regex(normpath(wildcard_pattern)) rec = re.compile(regex_pattern) search = rec.search # traverse directories matched_files = [] for dir_path, dirnames, filenames in os.walk(directory, topdown=topdown, followlinks=followlinks): for filename in filenames: f_name = normpath(join(dir_path, filename)) m = search(f_name) if m is not None: assert m.start() == 0 and m.end() == len(f_name), (m.start(), m.end(), f_name) matched_files.append( f_name ) return matched_files
def grab_files2(directory, suffix, topdown=True, followlinks=False)
-
Returns all files within directory AND its subdirectories that end with suffix.
If followlinks is set to True, symlinks are followed when traversing the directory. If topdown is set to False, subdirectories are traversed bottom-up.
@author: Stefan Romberg
Source code
def grab_files2(directory, suffix, topdown=True, followlinks=False): """Returns all files within directory AND its subdirectories that end with *suffix*. If *followlinks* is set to True, symlinks are followed when traversing the directory. If *topdown* is set to False, subdirectories are traversed bottom-up. @author: Stefan Romberg """ assert directory is not None and directory != "" # traverse directories matched_files = [] for dir_path, dirnames, filenames in os.walk(directory, topdown=topdown, followlinks=followlinks): print("grab_files2(): Processing dir: "+dir_path) for filename in filenames: if filename.endswith(suffix): matched_files.append( join(dir_path, filename) ) return matched_files
def icount(it)
-
Computes the length of some data given an iterator. Note: It consumes the iterator.
Source code
def icount(it): """Computes the length of some data given an iterator. Note: It consumes the iterator. """ for size,_ in enumerate(it): pass return size+1
def is_image_file(filename)
-
Determines if filename indicates that the file is an image.
Returns true if the filename ends with (case insensitive) one of the extensions '.jpg','.jpeg','.pgm','.png','.tif','.tiff','.gif','.bmp' and it does not start with '.'. (This hides the ._* files produced by Mac OS X aka "Apple Double files")
Source code
def is_image_file(filename): """Determines if filename indicates that the file is an image. Returns true if the filename ends with (case insensitive) one of the extensions '.jpg','.jpeg','.pgm','.png','.tif','.tiff','.gif','.bmp' and it does not start with '.'. (This hides the ._* files produced by Mac OS X aka "Apple Double files") """ # assume no image file starts with '.': Hides ._* files produced by Mac OS X. if filename is None or filename == "" or filename.startswith('.'): return False x = filename.lower() return ( x.endswith(".jpg") or x.endswith(".jpeg") or x.endswith(".pgm") or x.endswith(".png") or x.endswith(".tiff") or x.endswith(".tif") or x.endswith(".gif") or x.endswith(".bmp") )
def loaddump(filename, silent=False)
-
Unserializes data from a file that was written with the built-in cPickle module.
Seealso: :func:`savedump`
Source code
def loaddump(filename, silent=False): """Unserializes data from a file that was written with the built-in cPickle module. .. Seealso:: :func:`savedump` """ assert filename is not None and filename != "" and filename != "-", filename if not exists(filename): if not silent: print("loaddump(): File '"+filename+"' does not exist. Returning None.") return None try: if not silent: print("loaddump(): Loading data from '"+filename+"'..."), with open(filename, "rb") as f: data = pickle_load(f) if not silent: print("Done") return data except Exception as ex: print("ERROR: loaddump(): Could not load data from '"+filename+"'.") print(" Passing exception to caller.") print(str(ex)) raise ex
def msplit(s, delimiters=';,\t', strip=True, remove_empty_tokens=False, strip_linebreaks=True)
-
Splits the given string by any of the given delimiters. More sophisticated version of string.split() aka "multisplit".
Usage examples:
>>> msplit("abcd") ['abcd'] >>> msplit("a,b,c,d") ['a', 'b', 'c', 'd'] >>> msplit("a\tb,c,d") ['a', 'b', 'c', 'd'] >>> msplit("a\tb,c;d e") ['a', 'b', 'c', 'd e']
The parameter delimiters denotes all delimiters that are used to split the string into separate tokens. Delimiters must be single characters. Note: By default msplit() does not split the string at spaces.
>>> msplit("a\tb,c;d e", delimiters=";,\t ") ['a', 'b', 'c', 'd', 'e'] >>> msplit("a\tb,c;d e", delimiters=";") ['a\tb,c', 'd e']
If strip is True (default) then split tokens will further be stripped of leading and trailing whitespaces. Examples:
>>> msplit(" a, b , c ", strip=True) ['a', 'b', 'c'] >>> msplit(" a, b , c ", strip=False) [' a', ' b ', ' c ']
Note that if argument delimiter contains " " (space) as delimiter argument strip has no effect when set to True. Whitespaces are stripped from tokens after the original string has been split at delimiters. That means:
>>> msplit("a b c ", delimiters=" ", strip=True) ['a', 'b', 'c', '']
If strip_linebreaks is True (default) then line breaks will be removed before the string is split into tokens. This avoids trailing empty tokens: Examples:
Note: strip=True swallows line breaks, and so the last token will be empty:
msplit("a b c d \n", delimiters=" ", strip=True, strip_linebreaks=True) ['a', 'b', 'c', 'd', ''] msplit("a b c d \n", delimiters=" ", strip=True, strip_linebreaks=False) ['a', 'b', 'c', 'd', '']
Note: strip=False will preserve the trailing line break as an extra token:
msplit("a b c d \n", delimiters=" ", strip=False, strip_linebreaks=True) ['a', 'b', 'c', 'd', ''] msplit("a b c d \n", delimiters=" ", strip=False, strip_linebreaks=False) ['a', 'b', 'c', 'd', '\n']
If remove_empty_tokens is set to True then empty tokens are removed before the list of tokens is returned. By default remove_empty_tokens is False. Examples:
>>> msplit("a,,b", remove_empty_tokens=True) ['a', 'b'] >>> msplit("a,,b,", remove_empty_tokens=True) ['a', 'b'] >>> msplit("", remove_empty_tokens=True) [] >>> msplit(",", remove_empty_tokens=True) [] >>> msplit(",,,", remove_empty_tokens=True) [] >>> msplit("a,,b", remove_empty_tokens=False) ['a', '', 'b'] >>> msplit("a,,b,", remove_empty_tokens=False) ['a', '', 'b', ''] >>> msplit("", remove_empty_tokens=False) [] >>> msplit(",", remove_empty_tokens=False) ['', '']
Degenerated cases:
>>> msplit("") [] >>> msplit(",,,", remove_empty_tokens=False, strip_linebreaks=True) ['', '', '', ''] >>> msplit(",,,", remove_empty_tokens=False, strip_linebreaks=False) ['', '', '', ''] >>> msplit(",,,", remove_empty_tokens=True) [] >>> msplit(None) Traceback (most recent call last):
File "C:\EPD-6.2\lib\doctest.py", line 1248, in __run compileflags, 1) in test.globs File "
", line 1, in msplit(5) File "F: esearch\python\csv_scripts\csv_convert2cvectorfile.py", line 139, in msplit raise TypeError("msplit() expects string as first argument.") TypeError: msplit() expects string as first argument. >>> msplit(5) Traceback (most recent call last):
File "C:\EPD-6.2\lib\doctest.py", line 1248, in __run compileflags, 1) in test.globs File "
", line 1, in msplit(5) File "F: esearch\python\csv_scripts\csv_convert2cvectorfile.py", line 139, in msplit raise TypeError("msplit() expects string as first argument.") TypeError: msplit() expects string as first argument. Autor
:Stefan
Romberg
,stefan.romberg
@informatik.uni
-augsburg.de
Source code
def msplit(s, delimiters=";,\t", strip=True, remove_empty_tokens=False, strip_linebreaks=True):
    """Split *s* at any of the single-character *delimiters*.

    More sophisticated version of str.split() aka "multisplit".
    By default msplit() does not split the string at spaces.

    >>> msplit("abcd")
    ['abcd']
    >>> msplit("a,b,c,d")
    ['a', 'b', 'c', 'd']
    >>> msplit("a\\tb,c;d e")
    ['a', 'b', 'c', 'd e']
    >>> msplit(" a, b , c ", strip=False)
    [' a', ' b ', ' c ']
    >>> msplit("a,,b", remove_empty_tokens=True)
    ['a', 'b']
    >>> msplit("")
    []

    :param delimiters: all delimiters used to split the string; each
        must be a single character.
    :param strip: strip leading/trailing whitespace from each token
        (applied *after* splitting).
    :param remove_empty_tokens: drop empty tokens from the result.
    :param strip_linebreaks: remove one trailing line break before
        splitting, avoiding a trailing empty token.
    :raises TypeError: if *s* is not a str (bytes is decoded as UTF-8).

    Author: Stefan Romberg, stefan.romberg@informatik.uni-augsburg.de
    """
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    if not isinstance(s, str):
        raise TypeError("msplit() expects string as first argument.")
    # note: the original also tested `s is None` here, but None can never
    # reach this point past the isinstance() guard above (dead code removed)
    if len(s) == 0:
        return []

    # remove exactly one trailing line break: "\r\n", "\n" or "\r"
    if strip_linebreaks:
        if s.endswith("\r\n"):
            s = s[:-2]
        elif s.endswith("\n") or s.endswith("\r"):
            s = s[:-1]

    # normalize every delimiter to the first one, then split once
    delim = delimiters[0]
    for d in delimiters[1:]:
        s = s.replace(d, delim)
    tokens = s.split(delim)

    if strip:
        tokens = [t.strip() for t in tokens]
    if remove_empty_tokens:
        tokens = [t for t in tokens if len(t) > 0]
    return tokens
def read_compressed_file(filename)
-
Reads zlib compressed strings from a file.
Seealso
:func:
write_compressed_file()
:func:csv_read()
Source code
def read_compressed_file(filename):
    """Read a zlib-compressed file and return its contents as a str.

    Returns None when *filename* does not exist or is not a regular file.

    .. Seealso:: :func:`write_compressed_file` :func:`csv_read`
    """
    if not (exists(filename) and os.path.isfile(filename)):
        return None
    with open(filename, "rb") as fh:
        raw = fh.read()
    return zlib.decompress(raw).decode('utf-8')
def savedump(data, filename, silent=False)
-
Serializes data to a file using the built-in cPickle module.
Seealso
:func:
loaddump()
Source code
def savedump(data, filename, silent=False):
    """Serialize *data* to *filename* using pickle protocol 2.

    .. Seealso:: :func:`loaddump`
    """
    assert filename is not None and filename != "" and filename != "-", filename
    if not silent:
        # end="" keeps "Done" on the same line; the original's Python-2
        # trailing comma just built a throwaway (None,) tuple in Python 3
        print("savedump(): Saving data to '"+filename+"'...", end="")
    with open(filename, "wb") as f:
        pickle_dump(data, f, protocol=2)
    if not silent:
        print("Done")
def wildcards2regex(wildcard_pattern)
-
Converts a wildcard pattern such as "*.txt" to a regular expression.
Source code
def wildcards2regex(wildcard_pattern):
    """Convert a wildcard pattern such as "*.txt" to an anchored regular
    expression string ('*' -> '.*', '?' -> '.', metacharacters escaped).
    """
    p = wildcard_pattern
    p = p.replace("\\", "\\\\")
    p = p.replace(".", "\\.")
    p = p.replace("^", "\\^")
    p = p.replace("$", "\\$")
    p = p.replace("+", "\\+")
    p = p.replace("-", "\\-")
    p = p.replace("=", "\\=")  # ?
    p = p.replace(",", "\\,")  # ?
    p = p.replace("(", "\\(")
    p = p.replace(")", "\\)")
    p = p.replace("[", "\\[")
    p = p.replace("]", "\\]")
    p = p.replace("{", "\\{")
    p = p.replace("}", "\\}")
    # fixed: the original replaced "/" with "\/}", injecting a stray "}"
    # into every pattern containing a slash
    p = p.replace("/", "\\/")
    p = p.replace("*", ".*")
    p = p.replace("?", ".")
    return "^" + p + "$"
def write_compressed_file(filename, content, compression_level=9)
-
Writes the string content to the given file and compresses the data.
>>> data_in = "Das ist \nein Test\n." >>> write_compressed_file("testfile2", data_in) >>> data_out = read_compressed_file("testfile2") >>> data_in == data_out True
Seealso
:func:
read_compressed_file()
:func:csv_write()
Source code
def write_compressed_file(filename, content, compression_level=9):
    """Compress the string *content* with zlib and write it to *filename*.

    >>> data_in = "Das ist \\nein Test\\n."
    >>> write_compressed_file("testfile2", data_in)
    >>> data_out = read_compressed_file("testfile2")
    >>> data_in == data_out
    True

    .. Seealso:: :func:`read_compressed_file` :func:`csv_write`
    """
    assert isinstance(content, str), "Can only write strings."
    compressed = zlib.compress(content.encode('utf-8'), compression_level)
    with open(filename, "wb") as out:
        out.write(compressed)
Classes
class Tee
-
Simulates the behaviour of the unix program 'tee' to write output both to stdout and a file.
Source code
class Tee(object):
    """Simulates the behaviour of the unix program 'tee' to write output
    both to stdout *and* a file.

    Installing an instance replaces sys.stdout; the previous stream is
    restored when the instance is deleted.
    """

    def __init__(self, name, mode="w"):
        # a name of None or "-" disables the file mirror
        self.file = None
        if name is not None and name != "-":
            print("Tee: Mirroring stdout to file '"+name+"'")  # fixed typo "Mirring"
            self.file = open(name, mode)
        self.stdout = sys.stdout
        sys.stdout = self

    def __del__(self):
        # close the mirror file and restore the original stdout
        if self.file is not None:
            self.file.close()
        sys.stdout = self.stdout

    def write(self, data):
        if self.file is not None:
            self.file.write(data)
            #self.file.flush()
        self.stdout.write(data)

    def flush(self):
        # added: objects installed as sys.stdout must provide flush();
        # print(..., flush=True) and interpreter shutdown call it, which
        # raised AttributeError with the original class
        if self.file is not None:
            self.file.flush()
        self.stdout.flush()
Methods
def __init__(self, name, mode='w')
-
Initialize self. See help(type(self)) for accurate signature.
Source code
def __init__(self, name, mode="w"): self.file = None if name is not None and name != "-": print("Tee: Mirring stdout to file '"+name+"'") self.file = open(name, mode) self.stdout = sys.stdout sys.stdout = self
def write(self, data)
-
Source code
def write(self, data): if self.file is not None: self.file.write(data) #self.file.flush() self.stdout.write(data)