Snippet:Reading GIS shape file, cleaning data and performing various statistics

From HUES Platform Wiki
Description Reads the attribute table (DBF) of a GIS shape file into pandas, saves it as an HDF5 file and computes descriptive statistics using the pandas package.
Author(s)
Language Python
Tags python, GIS, Swiss building db
Snippet code ##########SCRIPT#########
import pandas as pd
import numpy as np
import seaborn as sns
import pysal as ps
import matplotlib as plt
import scipy as scipy


# Function to read ArcGIS db into a pandas data frame
# From the function author:
#     Read a dbf file as a pandas.DataFrame, optionally selecting the index
#     variable and which columns are to be loaded.
#     __author__  = "Dani Arribas-Bel <darribas@asu.edu> "
#     ...
#     Arguments
#     ---------
#     dbf_path    : str
#                   Path to the DBF file to be read
#     index       : str
#                   Name of the column to be used as the index of the DataFrame
#     cols        : list
#                   List with the names of the columns to be read into the
#                   DataFrame. Defaults to False, which reads the whole dbf
#     incl_index  : Boolean
#                   If True index is included in the DataFrame as a
#                   column too. Defaults to False
#     Returns
#     -------
#     df          : DataFrame
#                   pandas.DataFrame object created


def dbf2df(dbf_path, index=None, cols=False, incl_index=False):
    """Read a DBF file into a pandas.DataFrame.

    Parameters
    ----------
    dbf_path : str
        Path to the DBF file to be read.
    index : str, optional
        Name of the column to use as the index of the DataFrame. When
        omitted (or falsy) the DataFrame gets pandas' default index.
    cols : list, optional
        Names of the columns to load. A falsy value (the default,
        ``False``) loads every column listed in the file header.
        The list passed in is never modified.
    incl_index : bool, optional
        Only meaningful when both ``cols`` and ``index`` are given: if
        True, the index column is also loaded as a regular column.

    Returns
    -------
    pandas.DataFrame
        One column per loaded DBF field.
    """
    db = ps.open(dbf_path)
    try:
        if cols:
            # Copy first: appending to ``cols`` itself would silently grow
            # the caller's list on every call.
            vars_to_read = list(cols)
            # Skip the append when no index was named (would request a
            # ``None`` column) or when it is already selected.
            if incl_index and index and index not in vars_to_read:
                vars_to_read.append(index)
        else:
            vars_to_read = db.header
        data = {var: db.by_col(var) for var in vars_to_read}
        index_values = db.by_col(index) if index else None
    finally:
        # Release the file handle even if a column lookup fails.
        db.close()
    # ``index=None`` makes pandas build its default index, which matches
    # constructing the frame without an ``index`` argument.
    return pd.DataFrame(data, index=index_values)


# Names of the ArcGIS attribute-table columns to load, then read those
# columns from the land-use DBF file into a DataFrame.
colnames = [
    u'CH_CODE_HN', u'FLAECHE', u'LUID',
    u'MEAN_HEIGH', u'STD_HEIGHT', u'MIN_HEIGHT', u'COUNT_HEIG',
    u'MEAN_Shape', u'MIN_Shape_', u'SUM_Shape_',
    u'MEAN_AR_12', u'MIN_AR_12',
    u'BCR', u'FAR', u'B_per_A',
    u'SUM_FloorA', u'MEAN_Floor', u'MIN_FloorA', u'COUNT_Floo',
    u'MEAN_GASTW', u'SUM_GASTWS', u'STD_GASTWS',
    u'MEAN_ResiD', u'STD_ResiDe', u'COUNT_Resi',
    u'MEAN_NonRe', u'STD_NonRes', u'COUNT_NonR',
    u'Shape_Leng'
]
LandUse = dbf2df(
    r'C:\Users\research\Desktop\New sprint day wed\CH_V6\LandUse.dbf',
    cols=colnames
)


# Write the land-use DataFrame to an HDF5 file under the key 'GIS'.
# The store is opened as a context manager so that it is closed even if
# the write raises; previously a failing put() left the file handle open.
with pd.HDFStore(r'C:\Users\research\Desktop\New sprint day wed\CH_V6\LandUse_V8.h5') as hdf:
    # data_columns=True stores every column as a queryable data column.
    hdf.put('GIS', LandUse, format='Table', data_columns=True)


# Load the buildings table and the land-use (zone) table from their HDF5
# files; both are stored under the key 'GIS'.
buildings = pd.read_hdf(
    r'C:\Users\research\Desktop\New sprint day wed\CH_V2\Buildings.h5',
    key='GIS'
)
zone = pd.read_hdf(
    r'C:\Users\research\Desktop\New sprint day wed\CH_V6\LandUse_V8.h5',
    key='GIS'
)


# Replace every -999 value (ArcGIS marker for "no information") with NaN
# in both tables. No rows are dropped here; only the values are replaced.
zone = zone.replace(to_replace=-999, value=np.nan)
buildings = buildings.replace(to_replace=-999, value=np.nan)
# Bare expression: shows the table when run interactively (e.g. notebook).
buildings


# Keep only the building rows whose GAPTO value is not null.
buildings_no_nan = buildings.dropna(subset=['GAPTO'])
# Per column, count the remaining values that are <= 0. Despite the name,
# zeros are included in the count as well as negative values.
negative_buildings = buildings_no_nan.where(buildings_no_nan <= 0).count()
# Bare expression: shows the counts when run interactively.
negative_buildings


# Separate the data into the different zone types (residential, offices,
# etc.) by CH_CODE_HN code 11-17: first the land-use table here, then the
# buildings table in the same way.
# NOTE(review): the original comment also announced removing buildings
# without data on the number of people and buildings with a height below
# 2 m (unfeasible); no such filtering appears in this part of the script
# -- confirm whether it is still required.
type_11z = zone.loc[zone['CH_CODE_HN'] == 11]
type_12z = zone.loc[zone['CH_CODE_HN'] == 12]
type_13z = zone.loc[zone['CH_CODE_HN'] == 13]
type_14z = zone.loc[zone['CH_CODE_HN'] == 14]
type_15z = zone.loc[zone['CH_CODE_HN'] == 15]
type_16z = zone.loc[zone['CH_CODE_HN'] == 16]
type_17z = zone.loc[zone['CH_CODE_HN'] == 17]


# One buildings DataFrame per zone-type code (CH_CODE_HN 11-17).
# NOTE(review): this splits `buildings`, not the GAPTO-filtered
# `buildings_no_nan` -- confirm that the unfiltered table is intended.
type_11b = buildings.loc[buildings['CH_CODE_HN'] == 11]
type_12b = buildings.loc[buildings['CH_CODE_HN'] == 12]
type_13b = buildings.loc[buildings['CH_CODE_HN'] == 13]
type_14b = buildings.loc[buildings['CH_CODE_HN'] == 14]
type_15b = buildings.loc[buildings['CH_CODE_HN'] == 15]
type_16b = buildings.loc[buildings['CH_CODE_HN'] == 16]
type_17b = buildings.loc[buildings['CH_CODE_HN'] == 17]


# Calculate summary statistics (DataFrame.describe) for each zone type.
# Each bare expression after an assignment shows the result when the
# script is run interactively (e.g. in a notebook).


zone_11_description = type_11z.describe()
zone_11_description


zone_12_description = type_12z.describe()
zone_12_description


zone_13_description = type_13z.describe()
zone_13_description


zone_14_description = type_14z.describe()
zone_14_description


zone_15_description = type_15z.describe()
zone_15_description


# Codes 16 and 17 were originally stored only as `type_XX_description`,
# unlike the `zone_XX_description` names used for 11-15. The consistent
# name is now the primary one; the old name is kept as an alias so that
# existing references keep working.
zone_16_description = type_16z.describe()
type_16_description = zone_16_description
type_16_description


zone_17_description = type_17z.describe()
type_17_description = zone_17_description
type_17_description
Notes