{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "53755bb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark\n",
    "from delta import DeltaTable, configure_spark_with_delta_pip\n",
    "\n",
    "# Register Delta Lake's SQL extension and catalog on the session builder.\n",
    "builder = (\n",
    "    pyspark.sql.SparkSession.builder.appName(\"MyApp\")\n",
    "    .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\")\n",
    "    .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
    ")\n",
    "\n",
    "# Let the pip-installed delta package finish configuring the builder before the session is created.\n",
    "spark = configure_spark_with_delta_pip(builder).getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a9f65e85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Trip records read from JSON (columns include Bike #, Duration, Start/End Station, Trip ID).\n",
    "trips_df = spark.read.json('/home/celine/ipython-in-depth/btd2.json')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "73570cd7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- image: struct (nullable = true)\n",
      " | |-- origin: string (nullable = true)\n",
      " | |-- height: integer (nullable = true)\n",
      " | |-- width: integer (nullable = true)\n",
      " | |-- nChannels: integer (nullable = true)\n",
      " | |-- mode: integer (nullable = true)\n",
      " | |-- data: binary (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def load_png_images(path):\n",
    "    \"\"\"Read every *.png found recursively under `path` with Spark's image data source.\"\"\"\n",
    "    return (\n",
    "        spark.read.format(\"image\")\n",
    "        .option(\"recursiveFileLookup\", \"true\")\n",
    "        .option(\"pathGlobFilter\", \"*.png\")\n",
    "        .load(path)\n",
    "    )\n",
    "\n",
    "\n",
    "image1 = load_png_images(\"/home/celine/Bilder/zwischendatein/pc\")\n",
    "image2 = load_png_images(\"/home/celine/Bilder/zwischendatein/p\")\n",
    "image1.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ff444a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview a few rows only: toPandas() on the whole frame collects all 144015 rows onto the driver.\n",
    "trips_df.limit(10).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fdc549b4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
originheightwidthnChannelsmodedata
0file:///home/celine/Bilder/zwischendatein/pc/e...17532480424[158, 106, 82, 165, 158, 106, 82, 165, 158, 10...
1file:///home/celine/Bilder/zwischendatein/pc/e...17532480424[179, 239, 255, 4, 179, 239, 255, 4, 179, 239,...
2file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
7file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9file:///home/celine/Bilder/zwischendatein/pc/e...17532480424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
11file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
12file:///home/celine/Bilder/zwischendatein/pc/e...17532480424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13file:///home/celine/Bilder/zwischendatein/pc/e...24801753424[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
14file:///home/celine/Bilder/zwischendatein/pc/e...17532480316[148, 95, 74, 148, 95, 74, 148, 95, 74, 148, 9...
15file:///home/celine/Bilder/zwischendatein/pc/e...24801753316[227, 249, 255, 227, 249, 255, 227, 249, 255, ...
\n", "
" ], "text/plain": [ " origin height width \\\n", "0 file:///home/celine/Bilder/zwischendatein/pc/e... 1753 2480 \n", "1 file:///home/celine/Bilder/zwischendatein/pc/e... 1753 2480 \n", "2 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "3 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "4 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "5 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "6 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "7 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "8 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "9 file:///home/celine/Bilder/zwischendatein/pc/e... 1753 2480 \n", "10 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "11 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "12 file:///home/celine/Bilder/zwischendatein/pc/e... 1753 2480 \n", "13 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "14 file:///home/celine/Bilder/zwischendatein/pc/e... 1753 2480 \n", "15 file:///home/celine/Bilder/zwischendatein/pc/e... 2480 1753 \n", "\n", " nChannels mode data \n", "0 4 24 [158, 106, 82, 165, 158, 106, 82, 165, 158, 10... \n", "1 4 24 [179, 239, 255, 4, 179, 239, 255, 4, 179, 239,... \n", "2 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "3 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "4 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "5 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "6 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "7 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "8 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "9 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "10 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "11 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "12 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... 
\n", "13 4 24 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... \n", "14 3 16 [148, 95, 74, 148, 95, 74, 148, 95, 74, 148, 9... \n", "15 3 16 [227, 249, 255, 227, 249, 255, 227, 249, 255, ... " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "image1.select(\"image.*\").toPandas()" ] },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "75228690",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the image frame as a Delta table; \"overwrite\" replaces whatever is already at this path.\n",
    "image1.write.format(\"delta\").mode(\"overwrite\").save(\"/tmp/delta/art/file1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9ebbfe29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Delete, directly in the stored Delta table, every row whose image height is below 2000.\n",
    "delta_table = DeltaTable.forPath(spark, \"/tmp/delta/art/file1\")\n",
    "delta_table.delete(\"image.height < 2000\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "eaba0815",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------+\n",
      "|height|\n",
      "+------+\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "| 2480|\n",
      "+------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Re-read the table from disk to confirm that only rows with height >= 2000 remain.\n",
    "dlimage1 = spark.read.format(\"delta\").load(\"/tmp/delta/art/file1\")\n",
    "dlimage1.select(\"image.height\").show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0060f16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# image2 comes from the image-loading cell near the top; run the notebook top to bottom so it is defined here.\n",
    "dlimage1.union(image2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}