diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 982b5de49e6fa..8ec3adcdffd6f 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -245,6 +245,7 @@ def wrapper(cls): class _OrderedDict(dict): + """Dictionary that remembers insertion order""" # An inherited dict maps keys to values. # The inherited dict provides __getitem__, __len__, __contains__, and get. @@ -505,6 +506,7 @@ def viewitems(self): class _Counter(dict): + """Dict subclass for counting hashable objects. Sometimes called a bag or multiset. Elements are stored as dictionary keys and their counts are stored as dictionary values. diff --git a/pandas/computation/engines.py b/pandas/computation/engines.py index 88efc9eeab5d5..9738cac58fb2d 100644 --- a/pandas/computation/engines.py +++ b/pandas/computation/engines.py @@ -10,6 +10,7 @@ class AbstractEngine(object): + """Object serving as a base class for all engines.""" __metaclass__ = abc.ABCMeta @@ -73,6 +74,7 @@ def _evaluate(self): class NumExprEngine(AbstractEngine): + """NumExpr engine class""" has_neg_frac = True @@ -105,6 +107,7 @@ def _evaluate(self): class PythonEngine(AbstractEngine): + """Evaluate an expression in Python space. Mostly for testing purposes. diff --git a/pandas/computation/expr.py b/pandas/computation/expr.py index 1af41acd34ede..4c6ea3ecdae7d 100644 --- a/pandas/computation/expr.py +++ b/pandas/computation/expr.py @@ -70,6 +70,7 @@ def _raw_hex_id(obj, pad_size=2): class Scope(StringMixin): + """Object to hold scope, with a few bells to deal with some custom syntax added by pandas. @@ -324,6 +325,7 @@ def _node_not_implemented(node_name, cls): """Return a function that raises a NotImplementedError with a passed node name. """ + def f(self, *args, **kwargs): raise NotImplementedError("{0!r} nodes are not " "implemented".format(node_name)) @@ -356,6 +358,7 @@ def _op_maker(op_class, op_symbol): ------- f : callable """ + def f(self, node, *args, **kwargs): """Return a partial function with an Op subclass with an operator already passed. @@ -389,6 +392,7 @@ def f(cls): @disallow(_unsupported_nodes) @add_ops(_op_classes) class BaseExprVisitor(ast.NodeVisitor): + """Custom ast walker. Parsers of other engines should subclass this class if necessary. @@ -710,6 +714,7 @@ def visitor(x, y): (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn', 'Tuple']))) class PandasExprVisitor(BaseExprVisitor): + def __init__(self, env, engine, parser, preparser=lambda x: _replace_locals(_replace_booleans(x))): super(PandasExprVisitor, self).__init__(env, engine, parser, preparser) @@ -717,12 +722,14 @@ def __init__(self, env, engine, parser, @disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not'])) class PythonExprVisitor(BaseExprVisitor): + def __init__(self, env, engine, parser, preparser=lambda x: x): super(PythonExprVisitor, self).__init__(env, engine, parser, preparser=preparser) class Expr(StringMixin): + """Object encapsulating an expression. 
Parameters @@ -734,6 +741,7 @@ class Expr(StringMixin): truediv : bool, optional, default True level : int, optional, default 2 """ + def __init__(self, expr, engine='numexpr', parser='pandas', env=None, truediv=True, level=2): self.expr = expr diff --git a/pandas/computation/ops.py b/pandas/computation/ops.py index 0510ee86760a3..8d7bd0a819e79 100644 --- a/pandas/computation/ops.py +++ b/pandas/computation/ops.py @@ -27,7 +27,9 @@ class UndefinedVariableError(NameError): + """NameError subclass for local variables.""" + def __init__(self, *args): msg = 'name {0!r} is not defined' subbed = _TAG_RE.sub('', args[0]) @@ -51,6 +53,7 @@ def _possibly_update_key(d, value, old_key, new_key=None): class Term(StringMixin): + def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, string_types) else cls supr_new = super(Term, klass).__new__ @@ -195,6 +198,7 @@ def ndim(self): class Constant(Term): + def __init__(self, value, env, side=None, encoding=None): super(Constant, self).__init__(value, env, side=side, encoding=encoding) @@ -211,8 +215,10 @@ def name(self): class Op(StringMixin): + """Hold an operator of unknown arity """ + def __init__(self, op, operands, *args, **kwargs): self.op = _bool_op_map.get(op, op) self.operands = operands @@ -328,6 +334,7 @@ def is_term(obj): class BinOp(Op): + """Hold a binary operator and its operands Parameters @@ -336,6 +343,7 @@ class BinOp(Op): left : Term or Op right : Term or Op """ + def __init__(self, op, lhs, rhs, **kwargs): super(BinOp, self).__init__(op, (lhs, rhs)) self.lhs = lhs @@ -452,6 +460,7 @@ def _disallow_scalar_only_bool_ops(self): class Div(BinOp): + """Div operator to special case casting. Parameters @@ -462,6 +471,7 @@ class Div(BinOp): Whether or not to use true division. With Python 3 this happens regardless of the value of ``truediv``. """ + def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): super(Div, self).__init__('/', lhs, rhs, *args, **kwargs) @@ -475,6 +485,7 @@ def __init__(self, lhs, rhs, truediv=True, *args, **kwargs): class UnaryOp(Op): + """Hold a unary operator and its operands Parameters @@ -489,6 +500,7 @@ class UnaryOp(Op): ValueError * If no function associated with the passed operator token is found. 
""" + def __init__(self, op, operand): super(UnaryOp, self).__init__(op, (operand,)) self.operand = operand diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py index 8afe8e909a434..a521bfb3cfec9 100644 --- a/pandas/computation/pytables.py +++ b/pandas/computation/pytables.py @@ -30,6 +30,7 @@ def __init__(self, gbls=None, lcls=None, queryables=None, level=1): class Term(ops.Term): + def __new__(cls, name, env, side=None, encoding=None): klass = Constant if not isinstance(name, string_types) else cls supr_new = StringMixin.__new__ @@ -57,6 +58,7 @@ def value(self): class Constant(Term): + def __init__(self, value, env, side=None, encoding=None): super(Constant, self).__init__(value, env, side=side, encoding=encoding) @@ -292,9 +294,9 @@ def __unicode__(self): def invert(self): """ invert the condition """ - #if self.condition is not None: + # if self.condition is not None: # self.condition = "~(%s)" % self.condition - #return self + # return self raise NotImplementedError("cannot use an invert condition when " "passing to numexpr") diff --git a/pandas/computation/tests/test_eval.py b/pandas/computation/tests/test_eval.py index 44e560b86a683..f2d75d3fd21c5 100644 --- a/pandas/computation/tests/test_eval.py +++ b/pandas/computation/tests/test_eval.py @@ -35,6 +35,7 @@ _series_frame_incompatible = _bool_ops_syms _scalar_skip = 'in', 'not in' + def skip_if_no_ne(engine='numexpr'): if not _USE_NUMEXPR and engine == 'numexpr': raise nose.SkipTest("numexpr engine not installed or disabled") @@ -104,6 +105,7 @@ def _is_py3_complex_incompat(result, expected): class TestEvalNumexprPandas(unittest.TestCase): + @classmethod def setUpClass(cls): skip_if_no_ne() @@ -201,7 +203,8 @@ def test_compound_invert_op(self): @slow def test_chained_cmp_op(self): mids = self.lhses - cmp_ops = '<', '>'# tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) + # tuple(set(self.cmp_ops) - set(['==', '!=', '<=', '>='])) + cmp_ops = '<', '>' for lhs, cmp1, mid, cmp2, rhs in product(self.lhses, cmp_ops, mids, cmp_ops, self.rhses): self.check_chained_cmp_op(lhs, cmp1, mid, cmp2, rhs) @@ -231,7 +234,7 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): engine=self.engine, parser=self.parser) elif (np.isscalar(lhs) and np.isnan(lhs) and not np.isscalar(rhs) and (cmp1 in skip_these or cmp2 in - skip_these)): + skip_these)): with tm.assertRaises(TypeError): _eval_single_bin(lhs, binop, rhs, self.engine) else: @@ -243,19 +246,20 @@ def check_complex_cmp_op(self, lhs, cmp1, rhs, binop, cmp2): # TODO: the code below should be added back when left and right # hand side bool ops are fixed. 
- #try: - #self.assertRaises(Exception, pd.eval, ex, + # try: + # self.assertRaises(Exception, pd.eval, ex, #local_dict={'lhs': lhs, 'rhs': rhs}, - #engine=self.engine, parser=self.parser) - #except AssertionError: + # engine=self.engine, parser=self.parser) + # except AssertionError: #import ipdb; ipdb.set_trace() - #raise + # raise elif (np.isscalar(lhs_new) and np.isnan(lhs_new) and not np.isscalar(rhs_new) and binop in skip_these): with tm.assertRaises(TypeError): _eval_single_bin(lhs_new, binop, rhs_new, self.engine) else: - expected = _eval_single_bin(lhs_new, binop, rhs_new, self.engine) + expected = _eval_single_bin( + lhs_new, binop, rhs_new, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) assert_array_equal(result, expected) @@ -306,7 +310,7 @@ def check_operands(left, right, cmp_op): for ex in (ex1, ex2, ex3): result = pd.eval(ex, engine=self.engine, - parser=self.parser) + parser=self.parser) assert_array_equal(result, expected) @skip_incompatible_operand @@ -314,8 +318,8 @@ def check_simple_cmp_op(self, lhs, cmp1, rhs): ex = 'lhs {0} rhs'.format(cmp1) if cmp1 in ('in', 'not in') and not com.is_list_like(rhs): self.assertRaises(TypeError, pd.eval, ex, engine=self.engine, - parser=self.parser, local_dict={'lhs': lhs, - 'rhs': rhs}) + parser=self.parser, local_dict={'lhs': lhs, + 'rhs': rhs}) else: expected = _eval_single_bin(lhs, cmp1, rhs, self.engine) result = pd.eval(ex, engine=self.engine, parser=self.parser) @@ -396,7 +400,7 @@ def check_pow(self, lhs, arith1, rhs): result = pd.eval(ex, engine=self.engine, parser=self.parser) if (np.isscalar(lhs) and np.isscalar(rhs) and - _is_py3_complex_incompat(result, expected)): + _is_py3_complex_incompat(result, expected)): self.assertRaises(AssertionError, assert_array_equal, result, expected) else: @@ -462,9 +466,9 @@ def ex(self, op, var_name='lhs'): def test_frame_invert(self): expr = self.ex('~') - ## ~ ## + # ~ ## # frame - ## float always raises + # float always raises lhs = DataFrame(randn(5, 2)) if self.engine == 'numexpr': with tm.assertRaises(NotImplementedError): @@ -473,7 +477,7 @@ def test_frame_invert(self): with tm.assertRaises(TypeError): result = pd.eval(expr, engine=self.engine, parser=self.parser) - ## int raises on numexpr + # int raises on numexpr lhs = DataFrame(randint(5, size=(5, 2))) if self.engine == 'numexpr': with tm.assertRaises(NotImplementedError): @@ -483,13 +487,13 @@ def test_frame_invert(self): result = pd.eval(expr, engine=self.engine, parser=self.parser) assert_frame_equal(expect, result) - ## bool always works + # bool always works lhs = DataFrame(rand(5, 2) > 0.5) expect = ~lhs result = pd.eval(expr, engine=self.engine, parser=self.parser) assert_frame_equal(expect, result) - ## object raises + # object raises lhs = DataFrame({'b': ['a', 1, 2.0], 'c': rand(3) > 0.5}) if self.engine == 'numexpr': with tm.assertRaises(ValueError): @@ -499,11 +503,11 @@ def test_frame_invert(self): result = pd.eval(expr, engine=self.engine, parser=self.parser) def test_series_invert(self): - #### ~ #### + # ~ #### expr = self.ex('~') # series - ## float raises + # float raises lhs = Series(randn(5)) if self.engine == 'numexpr': with tm.assertRaises(NotImplementedError): @@ -512,7 +516,7 @@ def test_series_invert(self): with tm.assertRaises(TypeError): result = pd.eval(expr, engine=self.engine, parser=self.parser) - ## int raises on numexpr + # int raises on numexpr lhs = Series(randint(5, size=5)) if self.engine == 'numexpr': with tm.assertRaises(NotImplementedError): @@ -522,7 +526,7 @@ 
def test_series_invert(self): result = pd.eval(expr, engine=self.engine, parser=self.parser) assert_series_equal(expect, result) - ## bool + # bool lhs = Series(rand(5) > 0.5) expect = ~lhs result = pd.eval(expr, engine=self.engine, parser=self.parser) @@ -661,19 +665,30 @@ def test_scalar_unary(self): with tm.assertRaises(TypeError): pd.eval('~1.0', engine=self.engine, parser=self.parser) - self.assertEqual(pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0) - self.assertEqual(pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0) - - self.assertEqual(pd.eval('~1', parser=self.parser, engine=self.engine), ~1) - self.assertEqual(pd.eval('-1', parser=self.parser, engine=self.engine), -1) - self.assertEqual(pd.eval('+1', parser=self.parser, engine=self.engine), +1) - - self.assertEqual(pd.eval('~True', parser=self.parser, engine=self.engine), ~True) - self.assertEqual(pd.eval('~False', parser=self.parser, engine=self.engine), ~False) - self.assertEqual(pd.eval('-True', parser=self.parser, engine=self.engine), -True) - self.assertEqual(pd.eval('-False', parser=self.parser, engine=self.engine), -False) - self.assertEqual(pd.eval('+True', parser=self.parser, engine=self.engine), +True) - self.assertEqual(pd.eval('+False', parser=self.parser, engine=self.engine), +False) + self.assertEqual( + pd.eval('-1.0', parser=self.parser, engine=self.engine), -1.0) + self.assertEqual( + pd.eval('+1.0', parser=self.parser, engine=self.engine), +1.0) + + self.assertEqual( + pd.eval('~1', parser=self.parser, engine=self.engine), ~1) + self.assertEqual( + pd.eval('-1', parser=self.parser, engine=self.engine), -1) + self.assertEqual( + pd.eval('+1', parser=self.parser, engine=self.engine), +1) + + self.assertEqual( + pd.eval('~True', parser=self.parser, engine=self.engine), ~True) + self.assertEqual( + pd.eval('~False', parser=self.parser, engine=self.engine), ~False) + self.assertEqual( + pd.eval('-True', parser=self.parser, engine=self.engine), -True) + self.assertEqual( + pd.eval('-False', parser=self.parser, engine=self.engine), -False) + self.assertEqual( + pd.eval('+True', parser=self.parser, engine=self.engine), +True) + self.assertEqual( + pd.eval('+False', parser=self.parser, engine=self.engine), +False) def test_disallow_scalar_bool_ops(self): exprs = '1 or 2', '1 and 2' @@ -689,6 +704,7 @@ def test_disallow_scalar_bool_ops(self): class TestEvalNumexprPython(TestEvalNumexprPandas): + @classmethod def setUpClass(cls): skip_if_no_ne() @@ -714,6 +730,7 @@ def check_chained_cmp_op(self, lhs, cmp1, mid, cmp2, rhs): class TestEvalPythonPython(TestEvalNumexprPython): + @classmethod def setUpClass(cls): cls.engine = 'python' @@ -741,6 +758,7 @@ def check_alignment(self, result, nlhs, ghs, op): class TestEvalPythonPandas(TestEvalPythonPython): + @classmethod def setUpClass(cls): cls.engine = 'python' @@ -782,9 +800,9 @@ def check_basic_frame_alignment(self, engine, parser): self.index_types) for lr_idx_type, rr_idx_type, c_idx_type in args: df = mkdf(10, 10, data_gen_f=f, r_idx_type=lr_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) df2 = mkdf(20, 10, data_gen_f=f, r_idx_type=rr_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) res = pd.eval('df + df2', engine=engine, parser=parser) assert_frame_equal(res, df + df2) @@ -797,7 +815,7 @@ def check_frame_comparison(self, engine, parser): args = product(self.lhs_index_types, repeat=2) for r_idx_type, c_idx_type in args: df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) 
res = pd.eval('df < 2', engine=engine, parser=parser) assert_frame_equal(res, df < 2) @@ -829,9 +847,10 @@ def test_medium_complex_frame_alignment(self): def check_basic_frame_series_alignment(self, engine, parser): skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) @@ -856,9 +875,10 @@ def test_basic_frame_series_alignment(self): def check_basic_series_frame_alignment(self, engine, parser): skip_if_no_ne(engine) + def testit(r_idx_type, c_idx_type, index_name): df = mkdf(10, 7, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) @@ -873,12 +893,12 @@ def testit(r_idx_type, c_idx_type, index_name): assert_frame_equal(res, expected) # only test dt with dt, otherwise weird joins result - args = product(['i','u','s'],['i','u','s'],('index', 'columns')) + args = product(['i', 'u', 's'], ['i', 'u', 's'], ('index', 'columns')) for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) # dt with dt - args = product(['dt'],['dt'],('index', 'columns')) + args = product(['dt'], ['dt'], ('index', 'columns')) for r_idx_type, c_idx_type, index_name in args: testit(r_idx_type, c_idx_type, index_name) @@ -892,7 +912,7 @@ def check_series_frame_commutativity(self, engine, parser): ('index', 'columns')) for r_idx_type, c_idx_type, op, index_name in args: df = mkdf(10, 10, data_gen_f=f, r_idx_type=r_idx_type, - c_idx_type=c_idx_type) + c_idx_type=c_idx_type) index = getattr(df, index_name) s = Series(np.random.randn(5), index[:5]) @@ -1005,6 +1025,7 @@ def test_performance_warning_for_poor_alignment(self): # slightly more complex ops class TestOperationsNumExprPandas(unittest.TestCase): + @classmethod def setUpClass(cls): skip_if_no_ne() @@ -1175,21 +1196,22 @@ def test_assignment_column(self): # invalid assignees self.assertRaises(SyntaxError, df.eval, 'd,c = a + b') - self.assertRaises(SyntaxError, df.eval, 'Timestamp("20131001") = a + b') + self.assertRaises( + SyntaxError, df.eval, 'Timestamp("20131001") = a + b') # single assignment - existing variable expected = orig_df.copy() expected['a'] = expected['a'] + expected['b'] df = orig_df.copy() df.eval('a = a + b') - assert_frame_equal(df,expected) + assert_frame_equal(df, expected) # single assignment - new variable expected = orig_df.copy() expected['c'] = expected['a'] + expected['b'] df = orig_df.copy() df.eval('c = a + b') - assert_frame_equal(df,expected) + assert_frame_equal(df, expected) # with a local name overlap def f(): @@ -1201,9 +1223,10 @@ def f(): df = f() expected = orig_df.copy() expected['a'] = 1 + expected['b'] - assert_frame_equal(df,expected) + assert_frame_equal(df, expected) df = orig_df.copy() + def f(): a = 1 df.eval('a=a+b') @@ -1216,10 +1239,10 @@ def f(): # explicit targets df = orig_df.copy() - self.eval('c = df.a + df.b', local_dict={'df' : df}, target=df) + self.eval('c = df.a + df.b', local_dict={'df': df}, target=df) expected = orig_df.copy() expected['c'] = expected['a'] + expected['b'] - assert_frame_equal(df,expected) + assert_frame_equal(df, expected) def test_basic_period_index_boolean_expression(self): df = mkdf(2, 2, data_gen_f=f, c_idx_type='p', r_idx_type='i') @@ -1311,6 +1334,7 @@ def test_simple_in_ops(self): class TestOperationsNumExprPython(TestOperationsNumExprPandas): + 
@classmethod def setUpClass(cls): if not _USE_NUMEXPR: @@ -1377,6 +1401,7 @@ def test_simple_bool_ops(self): class TestOperationsPythonPython(TestOperationsNumExprPython): + @classmethod def setUpClass(cls): cls.engine = cls.parser = 'python' @@ -1386,6 +1411,7 @@ def setUpClass(cls): class TestOperationsPythonPandas(TestOperationsNumExprPandas): + @classmethod def setUpClass(cls): cls.engine = 'python' @@ -1397,6 +1423,7 @@ def setUpClass(cls): class TestScope(object): + def check_global_scope(self, e, engine, parser): skip_if_no_ne(engine) assert_array_equal(_var_s * 2, pd.eval(e, engine=engine, @@ -1478,6 +1505,7 @@ def test_is_expr_names(): _parsers = {'python': PythonExprVisitor, 'pytables': pytables.ExprVisitor, 'pandas': PandasExprVisitor} + def check_disallowed_nodes(engine, parser): skip_if_no_ne(engine) VisitorClass = _parsers[parser] diff --git a/pandas/core/array.py b/pandas/core/array.py index 209b00cf8bb3c..f267771bb770f 100644 --- a/pandas/core/array.py +++ b/pandas/core/array.py @@ -35,7 +35,7 @@ NA = np.nan -#### a series-like ndarray #### +# a series-like ndarray #### class SNDArray(Array): diff --git a/pandas/core/base.py b/pandas/core/base.py index a702e7c87c0a9..36c5a65163fad 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,6 +7,7 @@ class StringMixin(object): + """implements string methods so long as object defines a `__unicode__` method. @@ -55,6 +56,7 @@ def __repr__(self): class PandasObject(StringMixin): + """baseclass for various pandas objects""" @property @@ -96,6 +98,7 @@ def _reset_cache(self, key=None): class FrozenList(PandasObject, list): + """ Container that doesn't allow setting item *but* because it's technically non-hashable, will be used diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fec9cd4ff4274..23fccc3719278 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -33,6 +33,7 @@ def f(self, other): class Categorical(PandasObject): + """ Represents a categorical variable in classic R / S-plus fashion @@ -74,6 +75,7 @@ class Categorical(PandasObject): array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object) Levels (3): Index(['a', 'b', 'c'], dtype=object) """ + def __init__(self, labels, levels=None, name=None): if levels is None: if name is None: @@ -148,14 +150,14 @@ def _tidy_repr(self, max_vals=20): footer=False) result = '%s\n...\n%s' % (head, tail) - #TODO: tidy_repr for footer since there may be a ton of levels? + # TODO: tidy_repr for footer since there may be a ton of levels? result = '%s\n%s' % (result, self._repr_footer()) return compat.text_type(result) def _repr_footer(self): levheader = 'Levels (%d): ' % len(self.levels) - #TODO: should max_line_width respect a setting? + # TODO: should max_line_width respect a setting? levstring = np.array_repr(self.levels, max_line_width=60) indent = ' ' * (levstring.find('[') + len(levheader) + 1) lines = levstring.split('\n') @@ -222,11 +224,11 @@ def describe(self): """ Returns a dataframe with frequency and counts by level. """ - #Hack? + # Hack? 
from pandas.core.frame import DataFrame grouped = DataFrame(self.labels).groupby(0) counts = grouped.count().values.squeeze() - freqs = counts/float(counts.sum()) + freqs = counts / float(counts.sum()) return DataFrame.from_dict({ 'counts': counts, 'freqs': freqs, diff --git a/pandas/core/common.py b/pandas/core/common.py index 6fc015d2cb575..d251a2617f98d 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -73,6 +73,7 @@ def _check(cls, inst): class _ABCGeneric(type): + def __instancecheck__(cls, inst): return hasattr(inst, "_data") @@ -962,7 +963,7 @@ def changeit(): # if we are trying to do something unsafe # like put a bigger dtype in a smaller one, use the smaller one - if change.dtype.itemsize < r.dtype.itemsize: # pragma: no cover + if change.dtype.itemsize < r.dtype.itemsize:  # pragma: no cover raise AssertionError( "cannot change dtype of input to smaller size") change.dtype = r.dtype @@ -2469,8 +2470,8 @@ def _pprint_dict(seq, _nest_lvl=0, **kwds): nitems = get_option("max_seq_items") or len(seq) for k, v in list(seq.items())[:nitems]: - pairs.append(pfmt % (pprint_thing(k, _nest_lvl+1, **kwds), - pprint_thing(v, _nest_lvl+1, **kwds))) + pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, **kwds), + pprint_thing(v, _nest_lvl + 1, **kwds))) if nitems < len(seq): return fmt % (", ".join(pairs) + ", ...") diff --git a/pandas/core/config.py b/pandas/core/config.py index 6eb947119578f..ac48232ec618f 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -66,11 +66,12 @@ class OptionError(AttributeError, KeyError): + """Exception for pandas.options, backwards compatible with KeyError checks""" -########################################## +# # User API def _get_single_key(pat, silent): @@ -187,8 +188,10 @@ def get_default_val(pat): class DictWrapper(object): + """ provide attribute-style access to a nested dict """ + def __init__(self, d, prefix=""): object.__setattr__(self, "d", d) object.__setattr__(self, "prefix", prefix) @@ -354,11 +357,12 @@ def __doc__(self): describe_option = CallableDynamicDoc(_describe_option, _describe_option_tmpl) options = DictWrapper(_global_config) -###################################################### +# # Functions for use by pandas developers, in addition to User - api class option_context(object): + def __init__(self, *args): if not (len(args) % 2 == 0 and len(args) >= 2): raise AssertionError( @@ -499,7 +503,7 @@ def deprecate_option(key, msg=None, rkey=None, removal_ver=None): _deprecated_options[key] = DeprecatedOption(key, msg, rkey, removal_ver) -################################ +# # functions internal to the module def _select_options(pat): @@ -662,7 +666,7 @@ def pp(name, ks): return s -############## +# # helpers from contextlib import contextmanager diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index b9b934769793f..5502dc94e24c1 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -17,7 +17,7 @@ from pandas.core.format import detect_console_encoding -########################################### +# # options from the "display" namespace pc_precision_doc = """ diff --git a/pandas/core/daterange.py b/pandas/core/daterange.py index 9ddd76c471d44..bdaf546789c39 100644 --- a/pandas/core/daterange.py +++ b/pandas/core/daterange.py @@ -9,6 +9,7 @@ # DateRange class class DateRange(Index): + """Deprecated """ diff --git a/pandas/core/format.py b/pandas/core/format.py index 9abfe3c43b8e5..7354600c78c67 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -64,6 +64,7 @@
class CategoricalFormatter(object): + def __init__(self, categorical, buf=None, length=True, na_rep='NaN', name=False, footer=True): self.categorical = categorical @@ -246,6 +247,7 @@ def _get_formatter(self, i): class DataFrameFormatter(TableFormatter): + """ Render a DataFrame @@ -886,7 +888,7 @@ def __init__(self, obj, path_or_buf, sep=",", na_rep='', float_format=None, self.date_format = date_format - #GH3457 + # GH3457 if not self.obj.columns.is_unique and engine == 'python': raise NotImplementedError("columns.is_unique == False not " "supported with engine='python'") @@ -1155,7 +1157,7 @@ def _save_header(self): col_line.append(columns.names[i]) if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([''] * (len(index_label)-1)) + col_line.extend([''] * (len(index_label) - 1)) col_line.extend(columns.get_level_values(i)) @@ -1176,7 +1178,7 @@ def _save(self): # write in chunksize bites chunksize = self.chunksize - chunks = int(nrows / chunksize)+1 + chunks = int(nrows / chunksize) + 1 for i in range(chunks): start_i = i * chunksize @@ -1237,6 +1239,7 @@ def __init__(self, row, col, val, class ExcelFormatter(object): + """ Class for formatting a DataFrame to a list of ExcelCells, @@ -1591,6 +1594,7 @@ def _format(x): class FloatArrayFormatter(GenericArrayFormatter): + """ """ @@ -1860,6 +1864,7 @@ def get_console_size(): class EngFormatter(object): + """ Formats float values according to engineering format. diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b194c938b13cc..5e617671f5c49 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -766,9 +766,9 @@ def from_records(cls, data, index=None, exclude=None, columns=None, values = [first_row] - #if unknown length iterable (generator) + # if unknown length iterable (generator) if nrows is None: - #consume whole generator + # consume whole generator values += list(data) else: i = 1 @@ -1592,7 +1592,7 @@ def _ixs(self, i, axis=0, copy=False): # a numpy error (as numpy should really raise) values = self._data.iget(i) if not len(values): - values = np.array([np.nan]*len(self.index), dtype=object) + values = np.array([np.nan] * len(self.index), dtype=object) return self._constructor_sliced.from_array( values, index=self.index, name=label, fastpath=True) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 18f41917067f2..cfe3220102d54 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -52,7 +52,6 @@ _apply_whitelist = frozenset(['last', 'first', 'mean', 'sum', 'min', 'max', - 'head', 'tail', 'cumsum', 'cumprod', 'cummin', 'cummax', 'resample', 'describe', @@ -120,6 +119,7 @@ def _last(x): class GroupBy(PandasObject): + """ Class for grouping and aggregating relational data. See aggregate, transform, and apply functions on this object. @@ -185,6 +185,7 @@ class GroupBy(PandasObject): len(grouped) : int Number of groups """ + def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, sort=True, group_keys=True, squeeze=False): @@ -480,8 +481,9 @@ def picker(arr): return np.nan return self.agg(picker) - def cumcount(self): - """Number each item in each group from 0 to the length of that group. + def cumcount(self, **kwargs): + ''' + Number each item in each group from 0 to the length of that group. 
Essentially this is equivalent to @@ -509,13 +511,101 @@ def cumcount(self): 5 3 dtype: int64 - """ + ''' + ascending = kwargs.pop('ascending', True) + index = self.obj.index - cumcounts = np.zeros(len(index), dtype='int64') - for v in self.indices.values(): - cumcounts[v] = np.arange(len(v), dtype='int64') + rng = np.arange(self.grouper._max_groupsize, dtype='int64') + cumcounts = self._cumcount_array(rng, ascending=ascending) return Series(cumcounts, index) + def head(self, n=5): + ''' + Returns first n rows of each group. + + Essentially equivalent to .apply(lambda x: x.head(n)) + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).head(1) + A B + 0 1 2 + 2 5 6 + >>> df.groupby('A').head(1) + A B + A + 1 0 1 2 + 5 2 5 6 + + ''' + rng = np.arange(self.grouper._max_groupsize, dtype='int64') + in_head = self._cumcount_array(rng) < n + head = self.obj[in_head] + if self.as_index: + head.index = self._index_with_as_index(in_head) + return head + + def tail(self, n=5): + ''' + Returns last n rows of each group. + + Essentially equivalent to .apply(lambda x: x.tail(n)) + + Example + ------- + + >>> df = DataFrame([[1, 2], [1, 4], [5, 6]], + columns=['A', 'B']) + >>> df.groupby('A', as_index=False).tail(1) + A B + 1 1 4 + 2 5 6 + >>> df.groupby('A').tail(1) + A B + A + 1 1 1 4 + 5 2 5 6 + ''' + rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') + in_tail = self._cumcount_array(rng, ascending=False) > -n + tail = self.obj[in_tail] + if self.as_index: + tail.index = self._index_with_as_index(in_tail) + return tail + + def _cumcount_array(self, arr, **kwargs): + ascending = kwargs.pop('ascending', True) + + len_index = len(self.obj.index) + cumcounts = np.zeros(len_index, dtype='int64') + if ascending: + for v in self.indices.values(): + cumcounts[v] = arr[:len(v)] + else: + for v in self.indices.values(): + cumcounts[v] = arr[len(v) - 1::-1] + return cumcounts + + def _index_with_as_index(self, b): + ''' + Take a boolean mask of rows to be returned from apply, if as_index=True + + ''' + # TODO perf, it feels like this should already be somewhere...
+ from itertools import chain + original = self.obj.index + gp = self.grouper + levels = chain((gp.levels[i][gp.labels[i][b]] + for i in range(len(gp.groupings))), + (original.get_level_values(i)[b] + for i in range(original.nlevels))) + new = MultiIndex.from_arrays(list(levels)) + new.names = gp.names + original.names + return new + def _try_cast(self, result, obj): """ try to cast the result to our obj original type, @@ -653,9 +743,11 @@ def _is_indexed_like(obj, axes): class Grouper(object): + """ """ + def __init__(self, axis, groupings, sort=True, group_keys=True): self.axis = axis self.groupings = groupings @@ -754,14 +846,28 @@ def names(self): def size(self): """ Compute group sizes + """ # TODO: better impl labels, _, ngroups = self.group_info - bin_counts = Series(labels).value_counts() + bin_counts = algos.value_counts(labels, sort=False) bin_counts = bin_counts.reindex(np.arange(ngroups)) bin_counts.index = self.result_index return bin_counts + @cache_readonly + def _max_groupsize(self): + ''' + Compute size of largest group + + ''' + # For many items in each group this is much faster than + # self.size().max(), in worst case marginally slower + if self.indices: + return max(len(v) for v in self.indices.values()) + else: + return 0 + @cache_readonly def groups(self): if len(self.groupings) == 1: @@ -1209,6 +1315,7 @@ def agg_series(self, obj, func): class Grouping(object): + """ Holds the grouping information for a single key @@ -1229,6 +1336,7 @@ class Grouping(object): * group_index : unique groups * groups : dict of {group -> label_list} """ + def __init__(self, index, grouper=None, name=None, level=None, sort=True): @@ -1594,7 +1702,7 @@ def _get_index(): return index if isinstance(values[0], dict): - # # GH #823 + # GH #823 index = _get_index() return DataFrame(values, index=index).stack() @@ -2522,6 +2630,7 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): + def __init__(self, data, labels, ngroups, axis=0): super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis) @@ -2546,6 +2655,7 @@ def _chop(self, sdata, slice_obj): class NDFrameSplitter(DataSplitter): + def __init__(self, data, labels, ngroups, axis=0): super(NDFrameSplitter, self).__init__(data, labels, ngroups, axis=axis) @@ -2679,9 +2789,11 @@ def _lexsort_indexer(keys, orders=None): class _KeyMapper(object): + """ Ease my suffering. Map compressed group id -> key tuple """ + def __init__(self, comp_ids, ngroups, labels, levels): self.levels = levels self.labels = labels diff --git a/pandas/core/index.py b/pandas/core/index.py index 65eb8486c36d2..18d6a1a04e3f9 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1726,6 +1726,7 @@ def _wrap_joined_index(self, joined, other): class Float64Index(Index): + """ Immutable ndarray implementing an ordered, sliceable set. The basic object storing axis labels for all pandas objects.
Float64Index is a special case diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ab9000fd21a0a..6d0f57c6ddd57 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -22,7 +22,7 @@ def get_indexers_list(): ('loc', _LocIndexer), ('at', _AtIndexer), ('iat', _iAtIndexer), - ] + ] # "null slice" _NS = slice(None, None) @@ -116,13 +116,13 @@ def _convert_tuple(self, key, is_setter=False): def _convert_scalar_indexer(self, key, axis): # if we are accessing via lowered dim, use the last dim - ax = self.obj._get_axis(min(axis, self.ndim-1)) + ax = self.obj._get_axis(min(axis, self.ndim - 1)) # a scalar return ax._convert_scalar_indexer(key, typ=self.name) def _convert_slice_indexer(self, key, axis): # if we are accessing via lowered dim, use the last dim - ax = self.obj._get_axis(min(axis, self.ndim-1)) + ax = self.obj._get_axis(min(axis, self.ndim - 1)) return ax._convert_slice_indexer(key, typ=self.name) def _has_valid_setitem_indexer(self, indexer): @@ -494,7 +494,7 @@ def _align_series(self, indexer, ser): # broadcast along other dims ser = ser.values.copy() for (axis, l) in broadcast: - shape = [-1] * (len(broadcast)+1) + shape = [-1] * (len(broadcast) + 1) shape[axis] = l ser = np.tile(ser, l).reshape(shape) @@ -820,7 +820,7 @@ def _reindex(keys, level=None): # reindex with the specified axis ndim = self.obj.ndim - if axis+1 > ndim: + if axis + 1 > ndim: raise AssertionError("invalid indexing error with " "non-unique index") @@ -980,6 +980,7 @@ def _get_slice_axis(self, slice_obj, axis=0): class _IXIndexer(_NDFrameIndexer): + """ A primarily location based indexer, with integer fallback """ def _has_valid_type(self, key, axis): @@ -1039,6 +1040,7 @@ def _get_slice_axis(self, slice_obj, axis=0): class _LocIndexer(_LocationIndexer): + """ purely label based location based indexing """ _valid_types = ("labels (MUST BE IN THE INDEX), slices of labels (BOTH " "endpoints included! 
Can be slices of integers if the " @@ -1136,6 +1138,7 @@ def _getitem_axis(self, key, axis=0): class _iLocIndexer(_LocationIndexer): + """ purely integer based location based indexing """ _valid_types = ("integer, integer slice (START point is INCLUDED, END " "point is EXCLUDED), listlike of integers, boolean array") @@ -1228,6 +1231,7 @@ def _convert_to_indexer(self, obj, axis=0, is_setter=False): class _ScalarAccessIndexer(_NDFrameIndexer): + """ access scalars quickly """ def _convert_key(self, key): @@ -1257,11 +1261,13 @@ def __setitem__(self, key, value): class _AtIndexer(_ScalarAccessIndexer): + """ label based scalar accessor """ pass class _iAtIndexer(_ScalarAccessIndexer): + """ integer based scalar accessor """ def _has_valid_setitem_indexer(self, indexer): @@ -1301,7 +1307,7 @@ def _length_of_indexer(indexer, target=None): step = 1 elif step < 0: step = abs(step) - return (stop-start) / step + return (stop - start) / step elif isinstance(indexer, (ABCSeries, np.ndarray, list)): return len(indexer) elif not is_list_like(indexer): @@ -1346,6 +1352,7 @@ def _crit(v): class _SeriesIndexer(_IXIndexer): + """ Class to support fancy indexing, potentially using labels @@ -1504,11 +1511,11 @@ def _check_slice_bounds(slobj, values): l = len(values) start = slobj.start if start is not None: - if start < -l or start > l-1: + if start < -l or start > l - 1: raise IndexError("out-of-bounds on slice (start)") stop = slobj.stop if stop is not None: - if stop < -l-1 or stop > l: + if stop < -l - 1 or stop > l: raise IndexError("out-of-bounds on slice (end)") diff --git a/pandas/core/internals.py b/pandas/core/internals.py index bb719722fd090..44a18ef3043b3 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -533,7 +533,7 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): values[mask] = na_rep return values.tolist() - #### block actions #### + # block actions #### def copy(self, deep=True, ref_items=None): values = self.values if deep: @@ -667,7 +667,7 @@ def putmask(self, mask, new, align=True, inplace=False): new = self._try_cast(new) # pseudo-broadcast - if isinstance(new, np.ndarray) and new.ndim == self.ndim-1: + if isinstance(new, np.ndarray) and new.ndim == self.ndim - 1: new = np.repeat(new, self.shape[-1]).reshape(self.shape) np.putmask(new_values, mask, new) @@ -1026,7 +1026,7 @@ def where(self, other, cond, align=True, raise_on_error=True, # pseodo broadcast (its a 2d vs 1d say and where needs it in a # specific direction) - if (other.ndim >= 1 and values.ndim-1 == other.ndim and + if (other.ndim >= 1 and values.ndim - 1 == other.ndim and values.shape[0] != other.shape[0]): other = _block_shape(other).T else: @@ -1201,7 +1201,7 @@ def _try_fill(self, value): pass elif com.is_integer(value): # coerce to seconds of timedelta - value = np.timedelta64(int(value*1e9)) + value = np.timedelta64(int(value * 1e9)) elif isinstance(value, timedelta): value = np.timedelta64(value) @@ -3035,8 +3035,8 @@ def _add_new_block(self, item, value, loc=None): # need to shift elements to the right if self._ref_locs[loc] is not None: - for i in reversed(lrange(loc+1, len(self._ref_locs))): - self._ref_locs[i] = self._ref_locs[i-1] + for i in reversed(lrange(loc + 1, len(self._ref_locs))): + self._ref_locs[i] = self._ref_locs[i - 1] self._ref_locs[loc] = (new_block, 0) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b6ebeb7f96489..3b7320b848f9b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -23,6 +23,7 @@ class disallow(object): + def 
__init__(self, *dtypes): super(disallow, self).__init__() self.dtypes = tuple(np.dtype(dtype).type for dtype in dtypes) @@ -44,6 +45,7 @@ def _f(*args, **kwargs): class bottleneck_switch(object): + def __init__(self, zero_value=None, **kwargs): self.zero_value = zero_value self.kwargs = kwargs diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 0836ac7bc22a6..b8f988e38f14b 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -230,6 +230,7 @@ def add_flex_arithmetic_methods(cls, flex_arith_method, radd_func=None, class _TimeOp(object): + """ Wrapper around Series datetime/time/timedelta arithmetic operations. Generally, you should use classmethod ``maybe_convert_for_time_op`` as an diff --git a/pandas/core/panelnd.py b/pandas/core/panelnd.py index 8ac84c0d91adc..a7cfe49484d24 100644 --- a/pandas/core/panelnd.py +++ b/pandas/core/panelnd.py @@ -44,7 +44,7 @@ def create_nd_panel_factory(klass_name, orders, slices, slicer, aliases=None, klass._constructor_sliced = slicer - #### define the methods #### + # define the methods #### def __init__(self, *args, **kwargs): if not (kwargs.get('data') or len(args)): raise Exception( diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 24a4797759dab..1178a853c6619 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -22,6 +22,7 @@ class _Unstacker(object): + """ Helper class to unstack data / pivot with multi-level index @@ -57,6 +58,7 @@ class _Unstacker(object): ------- unstacked : DataFrame """ + def __init__(self, values, index, level=-1, value_columns=None): if values.ndim == 1: values = values[:, np.newaxis] diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0df9db2ebd06c..12f21df9e7c0e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -405,7 +405,7 @@ def f(x): else: return None else: - empty_row = Series(regex.groups*[None]) + empty_row = Series(regex.groups * [None]) def f(x): if not isinstance(x, compat.string_types): @@ -743,6 +743,7 @@ def do_copy(target): class StringMethods(object): + """ Vectorized string functions for Series. NAs stay NA unless handled otherwise by a particular method. 
Patterned after Python's string methods, @@ -753,6 +754,7 @@ class StringMethods(object): >>> s.str.split('_') >>> s.str.replace('_', '') """ + def __init__(self, series): self.series = series diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 9df5541615cee..010a65738caa0 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1203,24 +1203,54 @@ def test_groupby_as_index_apply(self): g_not_as = df.groupby('user_id', as_index=False) res_as = g_as.head(2).index - exp_as = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + exp_as = MultiIndex.from_tuples([(1, 0), (2, 1), (1, 2), (3, 4)]) assert_index_equal(res_as, exp_as) res_not_as = g_not_as.head(2).index - exp_not_as = Index([0, 2, 1, 4]) + exp_not_as = Index([0, 1, 2, 4]) assert_index_equal(res_not_as, exp_not_as) - res_as = g_as.apply(lambda x: x.head(2)).index - assert_index_equal(res_not_as, exp_not_as) + res_as_apply = g_as.apply(lambda x: x.head(2)).index + res_not_as_apply = g_not_as.apply(lambda x: x.head(2)).index - res_not_as = g_not_as.apply(lambda x: x.head(2)).index - assert_index_equal(res_not_as, exp_not_as) + # apply doesn't maintain the original ordering + exp_not_as_apply = Index([0, 2, 1, 4]) + exp_as_apply = MultiIndex.from_tuples([(1, 0), (1, 2), (2, 1), (3, 4)]) + + assert_index_equal(res_as_apply, exp_as_apply) + assert_index_equal(res_not_as_apply, exp_not_as_apply) ind = Index(list('abcde')) df = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=ind) res = df.groupby(0, as_index=False).apply(lambda x: x).index assert_index_equal(res, ind) + def test_groupby_head_tail(self): + df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + g_as = df.groupby('A', as_index=True) + g_not_as = df.groupby('A', as_index=False) + + # as_index=False is much easier + exp_head_not_as = df.loc[[0, 2]] + res_head_not_as = g_not_as.head(1) + assert_frame_equal(exp_head_not_as, res_head_not_as) + exp_tail_not_as = df.loc[[1, 2]] + res_tail_not_as = g_not_as.tail(1) + assert_frame_equal(exp_tail_not_as, res_tail_not_as) + + # as_index=True, yuck + res_head_as = g_as.head(1) + res_tail_as = g_as.tail(1) + + # prepend the A column as an index, in a roundabout way + df.index = df.set_index('A', append=True, + drop=False).index.swaplevel(0, 1) + exp_head_as = df.loc[[0, 2]] + exp_tail_as = df.loc[[1, 2]] + + assert_frame_equal(exp_head_as, res_head_as) + assert_frame_equal(exp_tail_as, res_tail_as) + def test_groupby_multiple_key(self): df = tm.makeTimeDataFrame() grouped = df.groupby([lambda x: x.year,
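
Reviewer note: the snippet below is a quick sanity check for the new GroupBy.head/tail/cumcount behaviour introduced in this PR. It is a minimal sketch and not part of the patch; it assumes a build of this branch, and the expected outputs are taken from the docstrings and tests above.

    # Illustrative only -- exercises the new head/tail/cumcount methods.
    from pandas import DataFrame

    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])

    # head/tail preserve the original row order within each group
    print(df.groupby('A', as_index=False).head(1))  # rows 0 and 2
    print(df.groupby('A', as_index=False).tail(1))  # rows 1 and 2

    # with as_index=True the group key is prepended as an index level
    print(df.groupby('A').head(1).index)  # MultiIndex: [(1, 0), (5, 2)]

    # cumcount numbers rows 0..len(group)-1 within each group; the
    # ascending keyword (popped from **kwargs) numbers them in reverse
    print(df.groupby('A').cumcount().values)                 # [0 1 0]
    print(df.groupby('A').cumcount(ascending=False).values)  # [1 0 0]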