1. Code Overview
The code runs Spark locally on Windows in local mode and reads data from a Hive table. The business logic is to compute the distance between pairs of longitude/latitude coordinates and insert the results into a new table. To run locally on Windows you need a simulated Hadoop environment for Windows (the winutils tool), the Hive configuration file hive-site.xml copied into the resources directory, and a pom file that declares Spark 3.0.0 built for Scala 2.12 to match the cluster.
二炊琉、下載windows環(huán)境下的hadoop包
- Download address: https://github.com/4ttty/winutils. The repository can only be downloaded as a whole; there is no way to fetch a single version. I used hadoop-2.8.3. Copy that folder to any directory and note its path.
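Before creating the SparkSession it is worth confirming that the directory you point Hadoop at actually contains bin\winutils.exe, since a missing winutils.exe only surfaces later as a cryptic error. The snippet below is only a sketch; the path is the example path used later in this article and should be replaced with wherever you copied hadoop-2.8.3.

import java.io.File;

public class WinutilsCheck {

    public static void main(String[] args) {
        // Example path only -- replace with the directory you copied hadoop-2.8.3 to.
        String hadoopHome = "E:\\git\\spark\\hadoop-2.8.3";
        File winutils = new File(hadoopHome, "bin\\winutils.exe");
        if (!winutils.isFile()) {
            throw new IllegalStateException("winutils.exe not found at " + winutils.getAbsolutePath());
        }
        // hadoop.home.dir is the property the Hadoop libraries read on Windows.
        System.setProperty("hadoop.home.dir", hadoopHome);
        System.out.println("Using Hadoop home: " + hadoopHome);
    }
}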
三箕般、工程創(chuàng)建
- Create a Maven project with IntelliJ IDEA. This is straightforward, so I won't describe it in detail.
- Copy the Hive configuration file hive-site.xml from the Linux environment into the resources directory of the project.
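The file copied from the cluster will usually contain many more properties than Spark needs. The minimal sketch below only illustrates the metastore settings that local Spark relies on; the host, port, and warehouse path are placeholders and must match your own cluster.

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Placeholder: point this at your own Hive metastore service -->
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop01:9083</value>
    </property>
    <!-- Placeholder: the warehouse directory configured on your cluster -->
    <property>
        <name>hive.metastore.warehouse.dir</name>
        <value>/user/hive/warehouse</value>
    </property>
</configuration>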
- pom file configuration
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>spark</artifactId>
    <version>1.0-SNAPSHOT</version>
    <dependencies>
        <!-- Match the Spark version in the pom to the Spark version installed on the Linux cluster -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.12</artifactId>
            <version>3.0.0</version>
        </dependency>
        <!-- Library used to compute the distance between two longitude/latitude coordinates -->
        <dependency>
            <groupId>org.gavaghan</groupId>
            <artifactId>geodesy</artifactId>
            <version>1.1.3</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>2.4.1</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                    <archive>
                        <manifest>
                            <mainClass>util.Microseer</mainClass>
                        </manifest>
                    </archive>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
- Create the bean class for the result table t_sitenumber_distance. It follows JavaBean conventions (getters and setters, default constructor) so that it can be used with Encoders.bean later.
/**
 * @author DongJing
 * @date 2021/12/9 16:39
 */
public class DistanceMeter {

    private String siteNumber;
    private double distance;
    private int flag;

    public String getSiteNumber() {
        return siteNumber;
    }

    public void setSiteNumber(String siteNumber) {
        this.siteNumber = siteNumber;
    }

    public double getDistance() {
        return distance;
    }

    public void setDistance(double distance) {
        this.distance = distance;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }
}
- Read the Hive table through Spark
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.gavaghan.geodesy.Ellipsoid;
import org.gavaghan.geodesy.GeodeticCalculator;
import org.gavaghan.geodesy.GeodeticCurve;
import org.gavaghan.geodesy.GlobalCoordinates;

/**
 * @author DongJing
 * @date 2021/12/9 16:39
 */
public class SparkSqlTest {

    public static void main(String[] args) {
        // Simulate a Hadoop environment on Windows. On Linux this line can be removed,
        // although leaving it in does no harm.
        System.setProperty("hadoop.home.dir", "E:\\git\\spark\\hadoop-2.8.3");
        // Obtain a SparkSession with Hive support
        SparkSession spark = SparkSession
                .builder()
                .appName("HiveSupport")
                .master("local")
                .enableHiveSupport()
                .getOrCreate();
        spark.sql("show databases").show();
        spark.sql("use sitelight");
        spark.sql("show tables").show();
        Dataset<Row> rowDataset = spark.sql("select t1.site_number, t1.longitude j1,t2.longitude j2,t1.latitude w1,t2.latitude w2 " +
                "from t_site_formal t1 inner join geo_site_info t2 on t1.site_number = t2.number where t1.del_flag=0 and t1.sign=0");
        Encoder<DistanceMeter> rowEncoder = Encoders.bean(DistanceMeter.class);
        // Split and assemble each row via map, returning a DistanceMeter object
        Dataset<DistanceMeter> distanceMeterDataset = rowDataset.map((MapFunction<Row, DistanceMeter>) row -> {
            DistanceMeter distanceMeter = new DistanceMeter();
            distanceMeter.setSiteNumber(row.get(0).toString());
            double j1 = Double.parseDouble(row.get(1).toString());   // longitude 1
            double j2 = Double.parseDouble(row.get(2).toString());   // longitude 2
            double w1 = Double.parseDouble(row.get(3).toString());   // latitude 1
            double w2 = Double.parseDouble(row.get(4).toString());   // latitude 2
            // GlobalCoordinates expects latitude first, then longitude
            GlobalCoordinates source = new GlobalCoordinates(w1, j1);
            GlobalCoordinates target = new GlobalCoordinates(w2, j2);
            double distance = getDistanceMeter(source, target, Ellipsoid.Sphere);
            int flag = distance <= 500 ? 0 : 1;
            distanceMeter.setDistance(distance);
            distanceMeter.setFlag(flag);
            return distanceMeter;
        }, rowEncoder);
        // Register the dataset as a temporary view and insert via Spark SQL
        // (createOrReplaceTempView replaces the deprecated registerTempTable)
        distanceMeterDataset.createOrReplaceTempView("tmp");
        spark.sql("INSERT INTO t_sitenumber_distance SELECT siteNumber, flag, distance FROM tmp");
        spark.close();
    }

    /**
     * Distance between two longitude/latitude coordinates.
     *
     * @param gpsFrom   starting coordinate
     * @param gpsTo     target coordinate
     * @param ellipsoid reference ellipsoid used for the calculation
     * @return distance in meters
     */
    public static double getDistanceMeter(GlobalCoordinates gpsFrom, GlobalCoordinates gpsTo, Ellipsoid ellipsoid) {
        // Create a GeodeticCalculator and compute the geodetic curve between the two coordinates
        GeodeticCurve geoCurve = new GeodeticCalculator().calculateGeodeticCurve(ellipsoid, gpsFrom, gpsTo);
        return geoCurve.getEllipsoidalDistance();
    }
}
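As a side note, the temp-view plus SQL insert at the end could also be written with the DataFrameWriter API. The sketch below is only an alternative, not what the article originally used; it assumes the columns of t_sitenumber_distance are ordered (siteNumber, flag, distance), matching the SELECT in the SQL version, because insertInto resolves columns by position rather than by name.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import static org.apache.spark.sql.functions.col;

public class DistanceWriter {

    /**
     * Alternative to the temp-view + SQL insert: append the dataset directly
     * with the DataFrameWriter API.
     */
    public static void appendToHive(Dataset<DistanceMeter> distanceMeterDataset) {
        distanceMeterDataset
                // select in the assumed column order of the target table
                .select(col("siteNumber"), col("flag"), col("distance"))
                .write()
                .mode(SaveMode.Append)
                .insertInto("t_sitenumber_distance");
    }
}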
四痢虹、我遇到的問題
- Error when running the main method
Exception in thread "main" java.lang.IllegalArgumentException: java.net.UnknownHostException: hadoop01
at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:378)
at org.apache.hadoop.hdfs.NameNodeProxies.createNonHAProxy(NameNodeProxies.java:320)
at org.apache.hadoop.hdfs.NameNodeProxies.createProxy(NameNodeProxies.java:176)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:678)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:619)
at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:149)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2669)
at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.getStagingDir(SaveAsHiveFile.scala:218)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.getStagingDir$(SaveAsHiveFile.scala:213)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.getStagingDir(InsertIntoHiveTable.scala:68)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.getExternalScratchDir(SaveAsHiveFile.scala:210)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.newVersionExternalTempPath(SaveAsHiveFile.scala:192)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.getExternalTmpPath(SaveAsHiveFile.scala:131)
at org.apache.spark.sql.hive.execution.SaveAsHiveFile.getExternalTmpPath$(SaveAsHiveFile.scala:100)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.getExternalTmpPath(InsertIntoHiveTable.scala:68)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable.run(InsertIntoHiveTable.scala:98)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:120)
at org.apache.spark.sql.Dataset.$anonfun$logicalPlan$1(Dataset.scala:229)
at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614)
at org.apache.spark.sql.Dataset.<init>(Dataset.scala:229)
at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:100)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:97)
at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:606)
at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763)
at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:601)
at SparkSqlTest.main(SparkSqlTest.java:44)
Caused by: java.net.UnknownHostException: hadoop01
... 40 more
Cause: the job reads data stored on the Hadoop cluster, but Windows cannot resolve the hostname hadoop01, so a host mapping has to be configured.
Fix: edit the hosts file under C:\Windows\System32\drivers\etc and add the line 172.16.100.26 hadoop01 at the end.
五丰捷、成功執(zhí)行
- The main method now runs to completion successfully.
- On the Linux server, open the Hive client or the Hadoop web UI to verify the result.
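For example, from the Hive CLI a quick sanity check might look like the following sketch (it only assumes the database and result table names used in this article):

-- Quick checks on the result table from the Hive CLI
USE sitelight;
SELECT COUNT(*) FROM t_sitenumber_distance;
SELECT * FROM t_sitenumber_distance LIMIT 10;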